python-igraph manual

For using igraph from Python

   Home       Trees       Indices       Help   
Package igraph :: Module statistics
[hide private]

Source Code for Module igraph.statistics

  1  # vim:ts=4:sw=4:sts=4:et 
  2  # -*- coding: utf-8 -*- 
  3  """ 
  4  Statistics related stuff in igraph 
  5  """ 
  6   
  7  __license__ = u"""\ 
  8  Copyright (C) 2006-2012  Tamas Nepusz <ntamas@gmail.com> 
  9  Pázmány Péter sétány 1/a, 1117 Budapest, Hungary 
 10   
 11  This program is free software; you can redistribute it and/or modify 
 12  it under the terms of the GNU General Public License as published by 
 13  the Free Software Foundation; either version 2 of the License, or 
 14  (at your option) any later version. 
 15   
 16  This program is distributed in the hope that it will be useful, 
 17  but WITHOUT ANY WARRANTY; without even the implied warranty of 
 18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 19  GNU General Public License for more details. 
 20   
 21  You should have received a copy of the GNU General Public License 
 22  along with this program; if not, write to the Free Software 
 23  Foundation, Inc.,  51 Franklin Street, Fifth Floor, Boston, MA  
 24  02110-1301 USA 
 25  """ 
 26   
 27  import math 
 28   
 29  __all__ = ["FittedPowerLaw", "Histogram", "RunningMean", "mean", "median", \ 
 30          "percentile", "quantile", "power_law_fit"] 
31 32 33 -class FittedPowerLaw(object):
34 """Result of fitting a power-law to a vector of samples 35 36 Example: 37 38 >>> result = power_law_fit([1, 2, 3, 4, 5, 6]) 39 >>> result # doctest:+ELLIPSIS 40 FittedPowerLaw(continuous=False, alpha=2.425828..., xmin=3.0, L=-7.54633..., D=0.2138..., p=0.99311...) 41 >>> print result # doctest:+ELLIPSIS 42 Fitted power-law distribution on discrete data 43 <BLANKLINE> 44 Exponent (alpha) = 2.425828 45 Cutoff (xmin) = 3.000000 46 <BLANKLINE> 47 Log-likelihood = -7.546337 48 <BLANKLINE> 49 H0: data was drawn from the fitted distribution 50 <BLANKLINE> 51 KS test statistic = 0.213817 52 p-value = 0.993111 53 <BLANKLINE> 54 H0 could not be rejected at significance level 0.05 55 >>> result.alpha # doctest:+ELLIPSIS 56 2.425828... 57 >>> result.xmin 58 3.0 59 >>> result.continuous 60 False 61 """ 62
63 - def __init__(self, continuous, alpha, xmin, L, D, p):
64 self.continuous = continuous 65 self.xmin = xmin 66 self.alpha = alpha 67 self.L = L 68 self.D = D 69 self.p = p 70
71 - def __repr__(self):
72 return "%s(continuous=%r, alpha=%r, xmin=%r, L=%r, D=%r, p=%r)" % \ 73 (self.__class__.__name__, self.continuous, self.alpha, \ 74 self.xmin, self.L, self.D, self.p) 75
76 - def __str__(self):
77 return self.summary(significance=0.05) 78
79 - def summary(self, significance=0.05):
80 """Returns the summary of the power law fit. 81 82 @param significance: the significance level of the Kolmogorov-Smirnov test 83 used to decide whether the input data could have come from the fitted 84 distribution 85 @return: the summary as a string 86 """ 87 result = ["Fitted power-law distribution on %s data" % \ 88 ("discrete", "continuous")[bool(self.continuous)]] 89 result.append("") 90 result.append("Exponent (alpha) = %f" % self.alpha) 91 result.append("Cutoff (xmin) = %f" % self.xmin) 92 result.append("") 93 result.append("Log-likelihood = %f" % self.L) 94 result.append("") 95 result.append("H0: data was drawn from the fitted distribution") 96 result.append("") 97 result.append("KS test statistic = %f" % self.D) 98 result.append("p-value = %f" % self.p) 99 result.append("") 100 if self.p < significance: 101 result.append("H0 rejected at significance level %g" \ 102 % significance) 103 else: 104 result.append("H0 could not be rejected at significance "\ 105 "level %g" % significance) 106 107 return "\n".join(result)
108
109 110 -class Histogram(object):
111 """Generic histogram class for real numbers 112 113 Example: 114 115 >>> h = Histogram(5) # Initializing, bin width = 5 116 >>> h << [2,3,2,7,8,5,5,0,7,9] # Adding more items 117 >>> print h 118 N = 10, mean +- sd: 4.8000 +- 2.9740 119 [ 0, 5): **** (4) 120 [ 5, 10): ****** (6) 121 """ 122
123 - def __init__(self, bin_width = 1, data = None):
124 """Initializes the histogram with the given data set. 125 126 @param bin_width: the bin width of the histogram. 127 @param data: the data set to be used. Must contain real numbers. 128 """ 129 self._bin_width = float(bin_width) 130 self._bins = None 131 self._min, self._max = None, None 132 self._running_mean = RunningMean() 133 self.clear() 134 135 if data: 136 self.add_many(data) 137
138 - def _get_bin(self, num, create = False):
139 """Returns the bin index corresponding to the given number. 140 141 @param num: the number for which the bin is being sought 142 @param create: whether to create a new bin if no bin exists yet. 143 @return: the index of the bin or C{None} if no bin exists yet and 144 {create} is C{False}.""" 145 if len(self._bins) == 0: 146 if not create: 147 result = None 148 else: 149 self._min = int(num/self._bin_width)*self._bin_width 150 self._max = self._min+self._bin_width 151 self._bins = [0] 152 result = 0 153 return result 154 155 if num >= self._min: 156 binidx = int((num-self._min)/self._bin_width) 157 if binidx < len(self._bins): 158 return binidx 159 if not create: 160 return None 161 extra_bins = binidx-len(self._bins)+1 162 self._bins.extend([0]*extra_bins) 163 self._max = self._min + len(self._bins)*self._bin_width 164 return binidx 165 166 if not create: 167 return None 168 169 extra_bins = int(math.ceil((self._min-num)/self._bin_width)) 170 self._bins[0:0] = [0]*extra_bins 171 self._min -= extra_bins*self._bin_width 172 self._max = self._min + len(self._bins)*self._bin_width 173 return 0 174 175 @property
176 - def n(self):
177 """Returns the number of elements in the histogram""" 178 return len(self._running_mean) 179 180 @property
181 - def mean(self):
182 """Returns the mean of the elements in the histogram""" 183 return self._running_mean.mean 184 185 # pylint: disable-msg=C0103 186 @property
187 - def sd(self):
188 """Returns the standard deviation of the elements in 189 the histogram""" 190 return self._running_mean.sd 191 192 @property
193 - def var(self):
194 """Returns the variance of the elements in the histogram""" 195 return self._running_mean.var 196
197 - def add(self, num, repeat=1):
198 """Adds a single number to the histogram. 199 200 @param num: the number to be added 201 @param repeat: number of repeated additions 202 """ 203 num = float(num) 204 binidx = self._get_bin(num, True) 205 self._bins[binidx] += repeat 206 self._running_mean.add(num, repeat) 207
208 - def add_many(self, data):
209 """Adds a single number or the elements of an iterable to the histogram. 210 211 @param data: the data to be added""" 212 try: 213 iterator = iter(data) 214 except TypeError: 215 iterator = iter([data]) 216 for x in iterator: 217 self.add(x) 218 __lshift__ = add_many 219
220 - def clear(self):
221 """Clears the collected data""" 222 self._bins = [] 223 self._min, self._max = None, None 224 self._running_mean = RunningMean() 225
226 - def bins(self):
227 """Generator returning the bins of the histogram in increasing order 228 229 @return: a tuple with the following elements: left bound, right bound, 230 number of elements in the bin""" 231 x = self._min 232 for elem in self._bins: 233 yield (x, x+self._bin_width, elem) 234 x += self._bin_width 235
236 - def __plot__(self, context, bbox, _, **kwds):
237 """Plotting support""" 238 from igraph.drawing.coord import DescartesCoordinateSystem 239 coord_system = DescartesCoordinateSystem(context, bbox, \ 240 (kwds.get("min", self._min), 0, \ 241 kwds.get("max", self._max), kwds.get("max_value", max(self._bins)) 242 )) 243 244 # Draw the boxes 245 context.set_line_width(1) 246 context.set_source_rgb(1., 0., 0.) 247 x = self._min 248 for value in self._bins: 249 top_left_x, top_left_y = coord_system.local_to_context(x, value) 250 x += self._bin_width 251 bottom_right_x, bottom_right_y = coord_system.local_to_context(x, 0) 252 context.rectangle(top_left_x, top_left_y, \ 253 bottom_right_x - top_left_x, \ 254 bottom_right_y - top_left_y) 255 context.fill() 256 257 # Draw the axes 258 coord_system.draw() 259
260 - def to_string(self, max_width=78, show_bars=True, show_counts=True):
261 """Returns the string representation of the histogram. 262 263 @param max_width: the maximal width of each line of the string 264 This value may not be obeyed if it is too small. 265 @param show_bars: specify whether the histogram bars should be shown 266 @param show_counts: specify whether the histogram counts should be 267 shown. If both I{show_bars} and I{show_counts} are C{False}, 268 only a general descriptive statistics (number of elements, mean and 269 standard deviation) is shown. 270 """ 271 272 if self._min is None or self._max is None: 273 return "N = 0" 274 275 # Determine how many decimal digits should we use 276 if int(self._min) == self._min and int(self._bin_width) == self._bin_width: 277 number_format = "%d" 278 else: 279 number_format = "%.3f" 280 num_length = max(len(number_format % self._min), \ 281 len(number_format % self._max)) 282 number_format = "%" + str(num_length) + number_format[1:] 283 format_string = "[%s, %s): %%s" % (number_format, number_format) 284 285 # Calculate the scale of the bars on the histogram 286 if show_bars: 287 maxval = max(self._bins) 288 if show_counts: 289 maxval_length = len(str(maxval)) 290 scale = maxval // (max_width-2*num_length-maxval_length-9) 291 else: 292 scale = maxval // (max_width-2*num_length-6) 293 scale = max(scale, 1) 294 295 result = ["N = %d, mean +- sd: %.4f +- %.4f" % \ 296 (self.n, self.mean, self.sd)] 297 298 if show_bars: 299 # Print the bars 300 if scale > 1: 301 result.append("Each * represents %d items" % scale) 302 if show_counts: 303 format_string += " (%d)" 304 for left, right, cnt in self.bins(): 305 result.append(format_string % (left, right, '*'*(cnt//scale), cnt)) 306 else: 307 for left, right, cnt in self.bins(): 308 result.append(format_string % (left, right, '*'*(cnt//scale))) 309 elif show_counts: 310 # Print the counts only 311 for left, right, cnt in self.bins(): 312 result.append(format_string % (left, right, cnt)) 313 314 return "\n".join(result) 315
316 - def __str__(self):
317 return self.to_string()
318
319 320 321 -class RunningMean(object):
322 """Running mean calculator. 323 324 This class can be used to calculate the mean of elements from a 325 list, tuple, iterable or any other data source. The mean is 326 calculated on the fly without explicitly summing the values, 327 so it can be used for data sets with arbitrary item count. Also 328 capable of returning the standard deviation (also calculated on 329 the fly) 330 """ 331 332 # pylint: disable-msg=C0103
333 - def __init__(self, items=None, n=0.0, mean=0.0, sd=0.0):
334 """RunningMean(items=None, n=0.0, mean=0.0, sd=0.0) 335 336 Initializes the running mean calculator. 337 338 There are two possible ways to initialize the calculator. 339 First, one can provide an iterable of items; alternatively, 340 one can specify the number of items, the mean and the 341 standard deviation if we want to continue an interrupted 342 calculation. 343 344 @param items: the items that are used to initialize the 345 running mean calcuator. If C{items} is given, C{n}, 346 C{mean} and C{sd} must be zeros. 347 @param n: the initial number of elements already processed. 348 If this is given, C{items} must be C{None}. 349 @param mean: the initial mean. If this is given, C{items} 350 must be C{None}. 351 @param sd: the initial standard deviation. If this is given, 352 C{items} must be C{None}.""" 353 if items is not None: 354 if n != 0 or mean != 0 or sd != 0: 355 raise ValueError("n, mean and sd must be zeros if items is not None") 356 self.clear() 357 self.add_many(items) 358 else: 359 self._nitems = float(n) 360 self._mean = float(mean) 361 if n > 1: 362 self._sqdiff = float(sd) ** 2 * float(n-1) 363 self._sd = float(sd) 364 else: 365 self._sqdiff = 0.0 366 self._sd = 0.0 367
368 - def add(self, value, repeat=1):
369 """RunningMean.add(value, repeat=1) 370 371 Adds the given value to the elements from which we calculate 372 the mean and the standard deviation. 373 374 @param value: the element to be added 375 @param repeat: number of repeated additions 376 """ 377 repeat = int(repeat) 378 self._nitems += repeat 379 delta = value - self._mean 380 self._mean += (repeat*delta / self._nitems) 381 self._sqdiff += (repeat*delta) * (value - self._mean) 382 if self._nitems > 1: 383 self._sd = (self._sqdiff / (self._nitems-1)) ** 0.5 384
385 - def add_many(self, values):
386 """RunningMean.add(values) 387 388 Adds the values in the given iterable to the elements from 389 which we calculate the mean. Can also accept a single number. 390 The left shift (C{<<}) operator is aliased to this function, 391 so you can use it to add elements as well: 392 393 >>> rm=RunningMean() 394 >>> rm << [1,2,3,4] 395 >>> rm.result # doctest:+ELLIPSIS 396 (2.5, 1.290994...) 397 398 @param values: the element(s) to be added 399 @type values: iterable""" 400 try: 401 iterator = iter(values) 402 except TypeError: 403 iterator = iter([values]) 404 for value in iterator: 405 self.add(value) 406
407 - def clear(self):
408 """Resets the running mean calculator.""" 409 self._nitems, self._mean = 0.0, 0.0 410 self._sqdiff, self._sd = 0.0, 0.0 411 412 @property
413 - def result(self):
414 """Returns the current mean and standard deviation as a tuple""" 415 return self._mean, self._sd 416 417 @property
418 - def mean(self):
419 """Returns the current mean""" 420 return self._mean 421 422 @property
423 - def sd(self):
424 """Returns the current standard deviation""" 425 return self._sd 426 427 @property
428 - def var(self):
429 """Returns the current variation""" 430 return self._sd ** 2 431
432 - def __repr__(self):
433 return "%s(n=%r, mean=%r, sd=%r)" % \ 434 (self.__class__.__name__, int(self._nitems), 435 self._mean, self._sd) 436
437 - def __str__(self):
438 return "Running mean (N=%d, %f +- %f)" % \ 439 (self._nitems, self._mean, self._sd) 440 441 __lshift__ = add_many 442
443 - def __float__(self):
444 return float(self._mean) 445
446 - def __int__(self):
447 return int(self._mean) 448
449 - def __long__(self):
450 return long(self._mean) 451
452 - def __complex__(self):
453 return complex(self._mean) 454
455 - def __len__(self):
456 return int(self._nitems)
457
458 459 -def mean(xs):
460 """Returns the mean of an iterable. 461 462 Example: 463 464 >>> mean([1, 4, 7, 11]) 465 5.75 466 467 @param xs: an iterable yielding numbers. 468 @return: the mean of the numbers provided by the iterable. 469 470 @see: RunningMean() if you also need the variance or the standard deviation 471 """ 472 return RunningMean(xs).mean 473
474 -def median(xs, sort=True):
475 """Returns the median of an unsorted or sorted numeric vector. 476 477 @param xs: the vector itself. 478 @param sort: whether to sort the vector. If you know that the vector is 479 sorted already, pass C{False} here. 480 @return: the median, which will always be a float, even if the vector 481 contained integers originally. 482 """ 483 if sort: 484 xs = sorted(xs) 485 486 mid = int(len(xs) / 2) 487 if 2 * mid == len(xs): 488 return float(xs[mid-1] + xs[mid]) / 2 489 else: 490 return float(xs[mid]) 491
492 -def percentile(xs, p=(25, 50, 75), sort=True):
493 """Returns the pth percentile of an unsorted or sorted numeric vector. 494 495 This is equivalent to calling quantile(xs, p/100.0); see L{quantile} 496 for more details on the calculation. 497 498 Example: 499 500 >>> round(percentile([15, 20, 40, 35, 50], 40), 2) 501 26.0 502 >>> for perc in percentile([15, 20, 40, 35, 50], (0, 25, 50, 75, 100)): 503 ... print "%.2f" % perc 504 ... 505 15.00 506 17.50 507 35.00 508 45.00 509 50.00 510 511 @param xs: the vector itself. 512 @param p: the percentile we are looking for. It may also be a list if you 513 want to calculate multiple quantiles with a single call. The default 514 value calculates the 25th, 50th and 75th percentile. 515 @param sort: whether to sort the vector. If you know that the vector is 516 sorted already, pass C{False} here. 517 @return: the pth percentile, which will always be a float, even if the vector 518 contained integers originally. If p is a list, the result will also be a 519 list containing the percentiles for each item in the list. 520 """ 521 if hasattr(p, "__iter__"): 522 return quantile(xs, (x/100.0 for x in p), sort) 523 return quantile(xs, p/100.0, sort) 524
525 -def power_law_fit(data, xmin=None, method="auto", return_alpha_only=False):
526 """Fitting a power-law distribution to empirical data 527 528 @param data: the data to fit, a list containing integer values 529 @param xmin: the lower bound for fitting the power-law. If C{None}, 530 the optimal xmin value will be estimated as well. Zero means that 531 the smallest possible xmin value will be used. 532 @param method: the fitting method to use. The following methods are 533 implemented so far: 534 535 - C{continuous}, C{hill}: exact maximum likelihood estimation 536 when the input data comes from a continuous scale. This is 537 known as the Hill estimator. The statistical error of 538 this estimator is M{(alpha-1) / sqrt(n)}, where alpha is the 539 estimated exponent and M{n} is the number of data points above 540 M{xmin}. The estimator is known to exhibit a small finite 541 sample-size bias of order M{O(n^-1)}, which is small when 542 M{n > 100}. igraph will try to compensate for the finite sample 543 size if n is small. 544 545 - C{discrete}: exact maximum likelihood estimation when the 546 input comes from a discrete scale (see Clauset et al among the 547 references). 548 549 - C{auto}: exact maximum likelihood estimation where the continuous 550 method is used if the input vector contains at least one fractional 551 value and the discrete method is used if the input vector contains 552 integers only. 553 554 @return: a L{FittedPowerLaw} object. The fitted C{xmin} value and the 555 power-law exponent can be queried from the C{xmin} and C{alpha} 556 properties of the returned object. 557 558 @newfield ref: Reference 559 @ref: MEJ Newman: Power laws, Pareto distributions and Zipf's law. 560 Contemporary Physics 46, 323-351 (2005) 561 @ref: A Clauset, CR Shalizi, MEJ Newman: Power-law distributions 562 in empirical data. E-print (2007). arXiv:0706.1062""" 563 from igraph._igraph import _power_law_fit 564 565 if xmin is None or xmin < 0: 566 xmin = -1 567 568 method = method.lower() 569 if method not in ("continuous", "hill", "discrete", "auto"): 570 raise ValueError("unknown method: %s" % method) 571 572 force_continuous = method in ("continuous", "hill") 573 fit = FittedPowerLaw(*_power_law_fit(data, xmin, force_continuous)) 574 if return_alpha_only: 575 from igraph import deprecated 576 deprecated("The return_alpha_only keyword argument of power_law_fit is "\ 577 "deprecated from igraph 0.7 and will be removed in igraph 0.8") 578 return fit.alpha 579 else: 580 return fit 581
582 -def quantile(xs, q=(0.25, 0.5, 0.75), sort=True):
583 """Returns the qth quantile of an unsorted or sorted numeric vector. 584 585 There are a number of different ways to calculate the sample quantile. The 586 method implemented by igraph is the one recommended by NIST. First we 587 calculate a rank n as q(N+1), where N is the number of items in xs, then we 588 split n into its integer component k and decimal component d. If k <= 1, 589 we return the first element; if k >= N, we return the last element, 590 otherwise we return the linear interpolation between xs[k-1] and xs[k] 591 using a factor d. 592 593 Example: 594 595 >>> round(quantile([15, 20, 40, 35, 50], 0.4), 2) 596 26.0 597 598 @param xs: the vector itself. 599 @param q: the quantile we are looking for. It may also be a list if you 600 want to calculate multiple quantiles with a single call. The default 601 value calculates the 25th, 50th and 75th percentile. 602 @param sort: whether to sort the vector. If you know that the vector is 603 sorted already, pass C{False} here. 604 @return: the qth quantile, which will always be a float, even if the vector 605 contained integers originally. If q is a list, the result will also be a 606 list containing the quantiles for each item in the list. 607 """ 608 if not xs: 609 raise ValueError("xs must not be empty") 610 611 if sort: 612 xs = sorted(xs) 613 614 if hasattr(q, "__iter__"): 615 qs = q 616 return_single = False 617 else: 618 qs = [q] 619 return_single = True 620 621 result = [] 622 for q in qs: 623 if q < 0 or q > 1: 624 raise ValueError("q must be between 0 and 1") 625 n = float(q) * (len(xs)+1) 626 k, d = int(n), n-int(n) 627 if k >= len(xs): 628 result.append(xs[-1]) 629 elif k < 1: 630 result.append(xs[0]) 631 else: 632 result.append((1-d) * xs[k-1] + d * xs[k]) 633 if return_single: 634 result = result[0] 635 return result 636
637 -def sd(xs):
638 """Returns the standard deviation of an iterable. 639 640 Example: 641 642 >>> sd([1, 4, 7, 11]) #doctest:+ELLIPSIS 643 4.2720... 644 645 @param xs: an iterable yielding numbers. 646 @return: the standard deviation of the numbers provided by the iterable. 647 648 @see: RunningMean() if you also need the mean 649 """ 650 return RunningMean(xs).sd 651
652 -def var(xs):
653 """Returns the variance of an iterable. 654 655 Example: 656 657 >>> var([1, 4, 8, 11]) #doctest:+ELLIPSIS 658 19.333333... 659 660 @param xs: an iterable yielding numbers. 661 @return: the variance of the numbers provided by the iterable. 662 663 @see: RunningMean() if you also need the mean 664 """ 665 return RunningMean(xs).var 666

   Home       Trees       Indices       Help