python-igraph manual

For using igraph from Python

Home       Trees       Indices       Help

Source Code for Module igraph.statistics

```  1  # vim:ts=4:sw=4:sts=4:et
2  # -*- coding: utf-8 -*-
3  """
4  Statistics related stuff in igraph
5  """
6
8  Copyright (C) 2006-2012  Tamas Nepusz <ntamas@gmail.com>
9  Pázmány Péter sétány 1/a, 1117 Budapest, Hungary
10
11  This program is free software; you can redistribute it and/or modify
13  the Free Software Foundation; either version 2 of the License, or
14  (at your option) any later version.
15
16  This program is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  GNU General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc.,  51 Franklin Street, Fifth Floor, Boston, MA
24  02110-1301 USA
25  """
26
27  import math
28
29  __all__ = ["FittedPowerLaw", "Histogram", "RunningMean", "mean", "median", \
30          "percentile", "quantile", "power_law_fit"]
31
32
33 -class FittedPowerLaw(object):
34      """Result of fitting a power-law to a vector of samples
35
36      Example:
37
38          >>> result = power_law_fit([1, 2, 3, 4, 5, 6])
39          >>> result                   # doctest:+ELLIPSIS
40          FittedPowerLaw(continuous=False, alpha=2.425828..., xmin=3.0, L=-7.54633..., D=0.2138..., p=0.99311...)
41          >>> print result             # doctest:+ELLIPSIS
42          Fitted power-law distribution on discrete data
43          <BLANKLINE>
44          Exponent (alpha)  = 2.425828
45          Cutoff (xmin)     = 3.000000
46          <BLANKLINE>
47          Log-likelihood    = -7.546337
48          <BLANKLINE>
49          H0: data was drawn from the fitted distribution
50          <BLANKLINE>
51          KS test statistic = 0.213817
52          p-value           = 0.993111
53          <BLANKLINE>
54          H0 could not be rejected at significance level 0.05
55          >>> result.alpha             # doctest:+ELLIPSIS
56          2.425828...
57          >>> result.xmin
58          3.0
59          >>> result.continuous
60          False
61      """
62
63 -    def __init__(self, continuous, alpha, xmin, L, D, p):
64          self.continuous = continuous
65          self.xmin = xmin
66          self.alpha = alpha
67          self.L = L
68          self.D = D
69          self.p = p
70
71 -    def __repr__(self):
72          return "%s(continuous=%r, alpha=%r, xmin=%r, L=%r, D=%r, p=%r)" % \
73                  (self.__class__.__name__, self.continuous, self.alpha, \
74                  self.xmin, self.L, self.D, self.p)
75
76 -    def __str__(self):
77          return self.summary(significance=0.05)
78
79 -    def summary(self, significance=0.05):
80          """Returns the summary of the power law fit.
81
82          @param significance: the significance level of the Kolmogorov-Smirnov test
83            used to decide whether the input data could have come from the fitted
84            distribution
85          @return: the summary as a string
86          """
87          result = ["Fitted power-law distribution on %s data" % \
88                  ("discrete", "continuous")[bool(self.continuous)]]
89          result.append("")
90          result.append("Exponent (alpha)  = %f" % self.alpha)
91          result.append("Cutoff (xmin)     = %f" % self.xmin)
92          result.append("")
93          result.append("Log-likelihood    = %f" % self.L)
94          result.append("")
95          result.append("H0: data was drawn from the fitted distribution")
96          result.append("")
97          result.append("KS test statistic = %f" % self.D)
98          result.append("p-value           = %f" % self.p)
99          result.append("")
100          if self.p < significance:
101              result.append("H0 rejected at significance level %g" \
102                      % significance)
103          else:
104              result.append("H0 could not be rejected at significance "\
105                      "level %g" % significance)
106
107          return "\n".join(result)
108
109
110 -class Histogram(object):
111      """Generic histogram class for real numbers
112
113      Example:
114
115          >>> h = Histogram(5)     # Initializing, bin width = 5
116          >>> h << [2,3,2,7,8,5,5,0,7,9]     # Adding more items
117          >>> print h
118          N = 10, mean +- sd: 4.8000 +- 2.9740
119          [ 0,  5): **** (4)
120          [ 5, 10): ****** (6)
121      """
122
123 -    def __init__(self, bin_width = 1, data = None):
124          """Initializes the histogram with the given data set.
125
126          @param bin_width: the bin width of the histogram.
127          @param data: the data set to be used. Must contain real numbers.
128          """
129          self._bin_width = float(bin_width)
130          self._bins = None
131          self._min, self._max = None, None
132          self._running_mean = RunningMean()
133          self.clear()
134
135          if data:
137
138 -    def _get_bin(self, num, create = False):
139          """Returns the bin index corresponding to the given number.
140
141          @param num: the number for which the bin is being sought
142          @param create: whether to create a new bin if no bin exists yet.
143          @return: the index of the bin or C{None} if no bin exists yet and
144            {create} is C{False}."""
145          if len(self._bins) == 0:
146              if not create:
147                  result = None
148              else:
149                  self._min = int(num/self._bin_width)*self._bin_width
150                  self._max = self._min+self._bin_width
151                  self._bins = [0]
152                  result = 0
153              return result
154
155          if num >= self._min:
156              binidx = int((num-self._min)/self._bin_width)
157              if binidx < len(self._bins):
158                  return binidx
159              if not create:
160                  return None
161              extra_bins = binidx-len(self._bins)+1
162              self._bins.extend([0]*extra_bins)
163              self._max = self._min + len(self._bins)*self._bin_width
164              return binidx
165
166          if not create:
167              return None
168
169          extra_bins = int(math.ceil((self._min-num)/self._bin_width))
170          self._bins[0:0] = [0]*extra_bins
171          self._min -= extra_bins*self._bin_width
172          self._max = self._min + len(self._bins)*self._bin_width
173          return 0
174
175      @property
176 -    def n(self):
177          """Returns the number of elements in the histogram"""
178          return len(self._running_mean)
179
180      @property
181 -    def mean(self):
182          """Returns the mean of the elements in the histogram"""
183          return self._running_mean.mean
184
185      # pylint: disable-msg=C0103
186      @property
187 -    def sd(self):
188          """Returns the standard deviation of the elements in
189          the histogram"""
190          return self._running_mean.sd
191
192      @property
193 -    def var(self):
194          """Returns the variance of the elements in the histogram"""
195          return self._running_mean.var
196
197 -    def add(self, num, repeat=1):
198          """Adds a single number to the histogram.
199
200          @param num: the number to be added
201          @param repeat: number of repeated additions
202          """
203          num = float(num)
204          binidx = self._get_bin(num, True)
205          self._bins[binidx] += repeat
207
209          """Adds a single number or the elements of an iterable to the histogram.
210
211          @param data: the data to be added"""
212          try:
213              iterator = iter(data)
214          except TypeError:
215              iterator = iter([data])
216          for x in iterator:
219
220 -    def clear(self):
221          """Clears the collected data"""
222          self._bins = []
223          self._min, self._max = None, None
224          self._running_mean = RunningMean()
225
226 -    def bins(self):
227          """Generator returning the bins of the histogram in increasing order
228
229          @return: a tuple with the following elements: left bound, right bound,
230            number of elements in the bin"""
231          x = self._min
232          for elem in self._bins:
233              yield (x, x+self._bin_width, elem)
234              x += self._bin_width
235
236 -    def __plot__(self, context, bbox, _, **kwds):
237          """Plotting support"""
238          from igraph.drawing.coord import DescartesCoordinateSystem
239          coord_system = DescartesCoordinateSystem(context, bbox, \
240              (kwds.get("min", self._min), 0, \
241               kwds.get("max", self._max), kwds.get("max_value", max(self._bins))
242              ))
243
244          # Draw the boxes
245          context.set_line_width(1)
246          context.set_source_rgb(1., 0., 0.)
247          x = self._min
248          for value in self._bins:
249              top_left_x, top_left_y = coord_system.local_to_context(x, value)
250              x += self._bin_width
251              bottom_right_x, bottom_right_y = coord_system.local_to_context(x, 0)
252              context.rectangle(top_left_x, top_left_y, \
253                                bottom_right_x - top_left_x, \
254                                bottom_right_y - top_left_y)
255              context.fill()
256
257          # Draw the axes
258          coord_system.draw()
259
260 -    def to_string(self, max_width=78, show_bars=True, show_counts=True):
261          """Returns the string representation of the histogram.
262
263          @param max_width: the maximal width of each line of the string
264            This value may not be obeyed if it is too small.
265          @param show_bars: specify whether the histogram bars should be shown
266          @param show_counts: specify whether the histogram counts should be
267            shown. If both I{show_bars} and I{show_counts} are C{False},
268            only a general descriptive statistics (number of elements, mean and
269            standard deviation) is shown.
270          """
271
272          if self._min is None or self._max is None:
273              return "N = 0"
274
275          # Determine how many decimal digits should we use
276          if int(self._min) == self._min and int(self._bin_width) == self._bin_width:
277              number_format = "%d"
278          else:
279              number_format = "%.3f"
280          num_length = max(len(number_format % self._min), \
281                           len(number_format % self._max))
282          number_format = "%" + str(num_length) + number_format[1:]
283          format_string = "[%s, %s): %%s" % (number_format, number_format)
284
285          # Calculate the scale of the bars on the histogram
286          if show_bars:
287              maxval = max(self._bins)
288              if show_counts:
289                  maxval_length = len(str(maxval))
290                  scale = maxval // (max_width-2*num_length-maxval_length-9)
291              else:
292                  scale = maxval // (max_width-2*num_length-6)
293              scale = max(scale, 1)
294
295          result = ["N = %d, mean +- sd: %.4f +- %.4f" % \
296              (self.n, self.mean, self.sd)]
297
298          if show_bars:
299              # Print the bars
300              if scale > 1:
301                  result.append("Each * represents %d items" % scale)
302              if show_counts:
303                  format_string += " (%d)"
304                  for left, right, cnt in self.bins():
305                      result.append(format_string % (left, right, '*'*(cnt//scale), cnt))
306              else:
307                  for left, right, cnt in self.bins():
308                      result.append(format_string % (left, right, '*'*(cnt//scale)))
309          elif show_counts:
310              # Print the counts only
311              for left, right, cnt in self.bins():
312                  result.append(format_string % (left, right, cnt))
313
314          return "\n".join(result)
315
316 -    def __str__(self):
317          return self.to_string()
318
319
320
321 -class RunningMean(object):
322      """Running mean calculator.
323
324      This class can be used to calculate the mean of elements from a
325      list, tuple, iterable or any other data source. The mean is
326      calculated on the fly without explicitly summing the values,
327      so it can be used for data sets with arbitrary item count. Also
328      capable of returning the standard deviation (also calculated on
329      the fly)
330      """
331
332      # pylint: disable-msg=C0103
333 -    def __init__(self, items=None, n=0.0, mean=0.0, sd=0.0):
334          """RunningMean(items=None, n=0.0, mean=0.0, sd=0.0)
335
336          Initializes the running mean calculator.
337
338          There are two possible ways to initialize the calculator.
339          First, one can provide an iterable of items; alternatively,
340          one can specify the number of items, the mean and the
341          standard deviation if we want to continue an interrupted
342          calculation.
343
344          @param items: the items that are used to initialize the
345            running mean calcuator. If C{items} is given, C{n},
346            C{mean} and C{sd} must be zeros.
347          @param n: the initial number of elements already processed.
348            If this is given, C{items} must be C{None}.
349          @param mean: the initial mean. If this is given, C{items}
350            must be C{None}.
351          @param sd: the initial standard deviation. If this is given,
352            C{items} must be C{None}."""
353          if items is not None:
354              if n != 0 or mean != 0 or sd != 0:
355                  raise ValueError("n, mean and sd must be zeros if items is not None")
356              self.clear()
358          else:
359              self._nitems = float(n)
360              self._mean = float(mean)
361              if n > 1:
362                  self._sqdiff = float(sd) ** 2 * float(n-1)
363                  self._sd = float(sd)
364              else:
365                  self._sqdiff = 0.0
366                  self._sd = 0.0
367
368 -    def add(self, value, repeat=1):
370
371          Adds the given value to the elements from which we calculate
372          the mean and the standard deviation.
373
374          @param value: the element to be added
375          @param repeat: number of repeated additions
376          """
377          repeat = int(repeat)
378          self._nitems += repeat
379          delta = value - self._mean
380          self._mean += (repeat*delta / self._nitems)
381          self._sqdiff += (repeat*delta) * (value - self._mean)
382          if self._nitems > 1:
383              self._sd = (self._sqdiff / (self._nitems-1)) ** 0.5
384
387
388          Adds the values in the given iterable to the elements from
389          which we calculate the mean. Can also accept a single number.
390          The left shift (C{<<}) operator is aliased to this function,
391          so you can use it to add elements as well:
392
393            >>> rm=RunningMean()
394            >>> rm << [1,2,3,4]
395            >>> rm.result               # doctest:+ELLIPSIS
396            (2.5, 1.290994...)
397
398          @param values: the element(s) to be added
399          @type values: iterable"""
400          try:
401              iterator = iter(values)
402          except TypeError:
403              iterator = iter([values])
404          for value in iterator:
406
407 -    def clear(self):
408          """Resets the running mean calculator."""
409          self._nitems, self._mean = 0.0, 0.0
410          self._sqdiff, self._sd = 0.0, 0.0
411
412      @property
413 -    def result(self):
414          """Returns the current mean and standard deviation as a tuple"""
415          return self._mean, self._sd
416
417      @property
418 -    def mean(self):
419          """Returns the current mean"""
420          return self._mean
421
422      @property
423 -    def sd(self):
424          """Returns the current standard deviation"""
425          return self._sd
426
427      @property
428 -    def var(self):
429          """Returns the current variation"""
430          return self._sd ** 2
431
432 -    def __repr__(self):
433          return "%s(n=%r, mean=%r, sd=%r)" % \
434                  (self.__class__.__name__, int(self._nitems),
435                          self._mean, self._sd)
436
437 -    def __str__(self):
438          return "Running mean (N=%d, %f +- %f)" % \
439              (self._nitems, self._mean, self._sd)
440
442
443 -    def __float__(self):
444          return float(self._mean)
445
446 -    def __int__(self):
447          return int(self._mean)
448
449 -    def __long__(self):
450          return long(self._mean)
451
452 -    def __complex__(self):
453          return complex(self._mean)
454
455 -    def __len__(self):
456          return int(self._nitems)
457
458
459 -def mean(xs):
460      """Returns the mean of an iterable.
461
462      Example:
463
464          >>> mean([1, 4, 7, 11])
465          5.75
466
467      @param xs: an iterable yielding numbers.
468      @return: the mean of the numbers provided by the iterable.
469
470      @see: RunningMean() if you also need the variance or the standard deviation
471      """
472      return RunningMean(xs).mean
473
474 -def median(xs, sort=True):
475      """Returns the median of an unsorted or sorted numeric vector.
476
477      @param xs: the vector itself.
478      @param sort: whether to sort the vector. If you know that the vector is
479        sorted already, pass C{False} here.
480      @return: the median, which will always be a float, even if the vector
481        contained integers originally.
482      """
483      if sort:
484          xs = sorted(xs)
485
486      mid = int(len(xs) / 2)
487      if 2 * mid == len(xs):
488          return float(xs[mid-1] + xs[mid]) / 2
489      else:
490          return float(xs[mid])
491
492 -def percentile(xs, p=(25, 50, 75), sort=True):
493      """Returns the pth percentile of an unsorted or sorted numeric vector.
494
495      This is equivalent to calling quantile(xs, p/100.0); see L{quantile}
496      for more details on the calculation.
497
498      Example:
499
500          >>> round(percentile([15, 20, 40, 35, 50], 40), 2)
501          26.0
502          >>> for perc in percentile([15, 20, 40, 35, 50], (0, 25, 50, 75, 100)):
503          ...     print "%.2f" % perc
504          ...
505          15.00
506          17.50
507          35.00
508          45.00
509          50.00
510
511      @param xs: the vector itself.
512      @param p: the percentile we are looking for. It may also be a list if you
513        want to calculate multiple quantiles with a single call. The default
514        value calculates the 25th, 50th and 75th percentile.
515      @param sort: whether to sort the vector. If you know that the vector is
516        sorted already, pass C{False} here.
517      @return: the pth percentile, which will always be a float, even if the vector
518        contained integers originally. If p is a list, the result will also be a
519        list containing the percentiles for each item in the list.
520      """
521      if hasattr(p, "__iter__"):
522          return quantile(xs, (x/100.0 for x in p), sort)
523      return quantile(xs, p/100.0, sort)
524
525 -def power_law_fit(data, xmin=None, method="auto", return_alpha_only=False):
526      """Fitting a power-law distribution to empirical data
527
528      @param data: the data to fit, a list containing integer values
529      @param xmin: the lower bound for fitting the power-law. If C{None},
530        the optimal xmin value will be estimated as well. Zero means that
531        the smallest possible xmin value will be used.
532      @param method: the fitting method to use. The following methods are
533        implemented so far:
534
535          - C{continuous}, C{hill}: exact maximum likelihood estimation
536            when the input data comes from a continuous scale. This is
537            known as the Hill estimator. The statistical error of
538            this estimator is M{(alpha-1) / sqrt(n)}, where alpha is the
539            estimated exponent and M{n} is the number of data points above
540            M{xmin}. The estimator is known to exhibit a small finite
541            sample-size bias of order M{O(n^-1)}, which is small when
542            M{n > 100}. igraph will try to compensate for the finite sample
543            size if n is small.
544
545          - C{discrete}: exact maximum likelihood estimation when the
546            input comes from a discrete scale (see Clauset et al among the
547            references).
548
549          - C{auto}: exact maximum likelihood estimation where the continuous
550            method is used if the input vector contains at least one fractional
551            value and the discrete method is used if the input vector contains
552            integers only.
553
554      @return: a L{FittedPowerLaw} object. The fitted C{xmin} value and the
555        power-law exponent can be queried from the C{xmin} and C{alpha}
556        properties of the returned object.
557
558      @newfield ref: Reference
559      @ref: MEJ Newman: Power laws, Pareto distributions and Zipf's law.
560        Contemporary Physics 46, 323-351 (2005)
561      @ref: A Clauset, CR Shalizi, MEJ Newman: Power-law distributions
562        in empirical data. E-print (2007). arXiv:0706.1062"""
563      from igraph._igraph import _power_law_fit
564
565      if xmin is None or xmin < 0:
566          xmin = -1
567
568      method = method.lower()
569      if method not in ("continuous", "hill", "discrete", "auto"):
570          raise ValueError("unknown method: %s" % method)
571
572      force_continuous = method in ("continuous", "hill")
573      fit = FittedPowerLaw(*_power_law_fit(data, xmin, force_continuous))
574      if return_alpha_only:
575          from igraph import deprecated
576          deprecated("The return_alpha_only keyword argument of power_law_fit is "\
577                  "deprecated from igraph 0.7 and will be removed in igraph 0.8")
578          return fit.alpha
579      else:
580          return fit
581
582 -def quantile(xs, q=(0.25, 0.5, 0.75), sort=True):
583      """Returns the qth quantile of an unsorted or sorted numeric vector.
584
585      There are a number of different ways to calculate the sample quantile. The
586      method implemented by igraph is the one recommended by NIST. First we
587      calculate a rank n as q(N+1), where N is the number of items in xs, then we
588      split n into its integer component k and decimal component d. If k <= 1,
589      we return the first element; if k >= N, we return the last element,
590      otherwise we return the linear interpolation between xs[k-1] and xs[k]
591      using a factor d.
592
593      Example:
594
595          >>> round(quantile([15, 20, 40, 35, 50], 0.4), 2)
596          26.0
597
598      @param xs: the vector itself.
599      @param q: the quantile we are looking for. It may also be a list if you
600        want to calculate multiple quantiles with a single call. The default
601        value calculates the 25th, 50th and 75th percentile.
602      @param sort: whether to sort the vector. If you know that the vector is
603        sorted already, pass C{False} here.
604      @return: the qth quantile, which will always be a float, even if the vector
605        contained integers originally. If q is a list, the result will also be a
606        list containing the quantiles for each item in the list.
607      """
608      if not xs:
609          raise ValueError("xs must not be empty")
610
611      if sort:
612          xs = sorted(xs)
613
614      if hasattr(q, "__iter__"):
615          qs = q
616          return_single = False
617      else:
618          qs = [q]
619          return_single = True
620
621      result = []
622      for q in qs:
623          if q < 0 or q > 1:
624              raise ValueError("q must be between 0 and 1")
625          n = float(q) * (len(xs)+1)
626          k, d = int(n), n-int(n)
627          if k >= len(xs):
628              result.append(xs[-1])
629          elif k < 1:
630              result.append(xs[0])
631          else:
632              result.append((1-d) * xs[k-1] + d * xs[k])
633      if return_single:
634          result = result[0]
635      return result
636
637 -def sd(xs):
638      """Returns the standard deviation of an iterable.
639
640      Example:
641
642          >>> sd([1, 4, 7, 11])       #doctest:+ELLIPSIS
643          4.2720...
644
645      @param xs: an iterable yielding numbers.
646      @return: the standard deviation of the numbers provided by the iterable.
647
648      @see: RunningMean() if you also need the mean
649      """
650      return RunningMean(xs).sd
651
652 -def var(xs):
653      """Returns the variance of an iterable.
654
655      Example:
656
657          >>> var([1, 4, 8, 11])            #doctest:+ELLIPSIS
658          19.333333...
659
660      @param xs: an iterable yielding numbers.
661      @return: the variance of the numbers provided by the iterable.
662
663      @see: RunningMean() if you also need the mean
664      """
665      return RunningMean(xs).var
666
<!--
expandto(location.href);
// -->

```

Home       Trees       Indices       Help
 Generated by Epydoc 3.0.1 on Fri May 10 10:51:16 2019 http://epydoc.sourceforge.net