% Bibliography on power-law distributions, citation analysis and related topics.
%
% Conventions used in this file:
%   - one field per line, fields in alphabetical order, UTF-8 text;
%   - `doi` holds the bare DOI (no resolver prefix);
%   - page ranges use a double hyphen;
%   - only the words of a title that must keep their capitalisation are braced;
%   - a literal percent sign in free-text fields is written as \%;
%   - `interhash` / `intrahash` are non-standard fields that were already
%     present in every entry; they are ignored by standard styles and kept as-is.
% Citation keys are unchanged so that existing \cite commands keep resolving.

@article{albarrn2011references,
  abstract  = {This article studies massive evidence about references made and citations received after a 5-year citation window by 3.7 million articles published in 1998 to 2002 in 22 scientific fields. We find that the distributions of references made and citations received share a number of basic features across sciences. Reference distributions are rather skewed to the right while citation distributions are even more highly skewed: The mean is about 20 percentage points to the right of the median, and articles with a remarkable or an outstanding number of citations represent about 9\% of the total. Moreover, the existence of a power law representing the upper tail of citation distributions cannot be rejected in 17 fields whose articles represent 74.7\% of the total. Contrary to the evidence in other contexts, the value of the scale parameter is above 3.5 in 13 of the 17 cases. Finally, power laws are typically small, but capture a considerable proportion of the total citations received.},
  author    = {Albarrán, Pedro and Ruiz-Castillo, Javier},
  doi       = {10.1002/asi.21448},
  interhash = {79502663727fcbd4834a423f4e3212a3},
  intrahash = {f20e50e960696bab3b39b628718dd850},
  issn      = {1532-2890},
  journal   = {Journal of the American Society for Information Science and Technology},
  number    = 1,
  pages     = {40--49},
  publisher = {Wiley Subscription Services, Inc., A Wiley Company},
  title     = {References made and citations received by scientific articles},
  url       = {http://dx.doi.org/10.1002/asi.21448},
  volume    = 62,
  year      = 2011
}

% arXiv preprint; the arXiv identifier from the url is also exposed through
% eprint/eprinttype. The key Clauset2007 further below carries the same
% interhash/intrahash, i.e. it is a second record of this work.
@misc{clauset2007powerlaw,
  abstract   = {Power-law distributions occur in many situations of scientific interest and have significant consequences for our understanding of natural and man-made phenomena. Unfortunately, the detection and characterization of power laws is complicated by the large fluctuations that occur in the tail of the distribution -- the part of the distribution representing large but rare events -- and by the difficulty of identifying the range over which power-law behavior holds. Commonly used methods for analyzing power-law data, such as least-squares fitting, can produce substantially inaccurate estimates of parameters for power-law distributions, and even in cases where such methods return accurate answers they are still unsatisfactory because they give no indication of whether the data obey a power law at all. Here we present a principled statistical framework for discerning and quantifying power-law behavior in empirical data. Our approach combines maximum-likelihood fitting methods with goodness-of-fit tests based on the Kolmogorov-Smirnov statistic and likelihood ratios. We evaluate the effectiveness of the approach with tests on synthetic data and give critical comparisons to previous approaches. We also apply the proposed methods to twenty-four real-world data sets from a range of different disciplines, each of which has been conjectured to follow a power-law distribution. In some cases we find these conjectures to be consistent with the data while in others the power law is ruled out.},
  author     = {Clauset, Aaron and Shalizi, Cosma Rohilla and Newman, M. E. J.},
  doi        = {10.1137/070710111},
  eprint     = {0706.1062},
  eprinttype = {arXiv},
  interhash  = {2e3bc5bbd7449589e8bfb580e8936d4b},
  intrahash  = {7da1624e601898dd74df839ce2daeb24},
  note       = {cite arxiv:0706.1062 Comment: 43 pages, 11 figures, 7 tables, 4 appendices; code available at http://www.santafe.edu/~aaronc/powerlaws/},
  title      = {Power-law distributions in empirical data},
  url        = {http://arxiv.org/abs/0706.1062},
  year       = 2007
}

% German-language title: the capitalised words are braced so that
% sentence-casing styles do not lowercase them.
@article{rauber2009webarchivierung,
  abstract  = {In den letzten Jahren haben Bibliotheken und Archive zunehmend die Aufgabe übernommen, neben konventionellen Publikationen auch Inhalte aus dem World Wide Web zu sammeln, um so diesen wertvollen Teil unseres kulturellen Erbes zu bewahren und wichtige Informationen langfristig verfügbar zu halten. Diese massiven Datensammlungen bieten faszinierende Möglichkeiten, rasch Zugriff auf wichtige Informationen zu bekommen, die im Live-Web bereits verloren gegangen sind. Sie sind eine unentbehrliche Quelle für Wissenschaftler, die in der Zukunft die gesellschaftliche und technologische Entwicklung unserer Zeit nachvollziehen wollen. Auf der anderen Seite stellt eine derartige Datensammlung aber einen völlig neuen Datenbestand dar, der nicht nur rechtliche, sondern auch zahlreiche ethische Fragen betreffend seine Nutzung aufwirft. Diese werden in dem Ausmaß zunehmen, in dem die technischen Möglichkeiten zur automatischen Analyse und Interpretation dieser Daten leistungsfähiger werden. Da sich die meisten Webarchivierungsinitiativen dieser Problematik bewusst sind, bleibt die Nutzung der Daten derzeit meist stark eingeschränkt, oder es wird eine Art von "Opt-Out"-Möglichkeit vorgesehen, wodurch Webseiteninhaber die Aufnahme ihrer Seiten in ein Webarchiv ausschließen können. Mit beiden Ansätzen können Webarchive ihr volles Nutzungspotenzial nicht ausschöpfen. Dieser Artikel beschreibt einleitend kurz die Technologien, die zur Sammlung von Webinhalten zu Archivierungszwecken verwendet werden. Er hinterfragt Annahmen, die die freie Verfügbarkeit der Daten und unterschiedliche Nutzungsarten betreffen. Darauf aufbauend identifiziert er eine Reihe von offenen Fragen, deren Lösung einen breiteren Zugriff und bessere Nutzung von Webarchiven erlauben könnte.},
  author    = {Rauber, Andreas and Kaiser, Max},
  editor    = {Knoll, Matthias and Meier, Andreas},
  interhash = {3b35b676a2817868d93481aeebfa4154},
  intrahash = {cdaef18169a7d8300cf54daf018a74cc},
  issn      = {1436-3011},
  journal   = {HMD Praxis der Wirtschaftsinformatik},
  month     = aug,
  publisher = {dpunkt.verlag},
  title     = {{Webarchivierung} und {Web} {Archive} {Mining}: {Notwendigkeit}, {Probleme} und {Lösungsansätze}},
  url       = {http://hmd.dpunkt.de/268/03.php},
  volume    = 268,
  year      = 2009
}

% Journal version (SIAM Review) of the Clauset/Shalizi/Newman paper.
@article{clauset2009powerlaw,
  abstract  = {Power-law distributions occur in many situations of scientific interest and have significant consequences for our understanding of natural and man-made phenomena. Unfortunately, the detection and characterization of power laws is complicated by the large fluctuations that occur in the tail of the distribution—the part of the distribution representing large but rare events—and by the difficulty of identifying the range over which power-law behavior holds. Commonly used methods for analyzing power-law data, such as least-squares fitting, can produce substantially inaccurate estimates of parameters for power-law distributions, and even in cases where such methods return accurate answers they are still unsatisfactory because they give no indication of whether the data obey a power law at all. Here we present a principled statistical framework for discerning and quantifying power-law behavior in empirical data. Our approach combines maximum-likelihood fitting methods with goodness-of-fit tests based on the Kolmogorov–Smirnov (KS) statistic and likelihood ratios. We evaluate the effectiveness of the approach with tests on synthetic data and give critical comparisons to previous approaches. We also apply the proposed methods to twenty-four real-world data sets from a range of different disciplines, each of which has been conjectured to follow a power-law distribution. In some cases we find these conjectures to be consistent with the data, while in others the power law is ruled out.},
  author    = {Clauset, Aaron and Shalizi, Cosma Rohilla and Newman, M. E. J.},
  doi       = {10.1137/070710111},
  interhash = {9ce8658af5a6358a758bfdb819f73394},
  intrahash = {c0097d202655474b1db6811ddea03410},
  issn      = {0036-1445},
  journal   = {SIAM Review},
  number    = 4,
  pages     = {661--703},
  publisher = {SIAM},
  title     = {Power-Law Distributions in Empirical Data},
  url       = {http://link.aip.org/link/?SIR/51/661/1},
  volume    = 51,
  year      = 2009
}

@article{mitzenmacher2004history,
  abstract  = {Recently, I became interested in a current debate over whether file size distributions are best modelled by a power law distribution or a lognormal distribution. In trying to learn enough about these distributions to settle the question, I found a rich and long history, spanning many fields. Indeed, several recently proposed models from the computer science community have antecedents in work from decades ago. Here, I briefly survey some of this history, focusing on underlying generative models that lead to these distributions. One finding is that lognormal and power law distributions connect quite naturally, and hence, it is not surprising that lognormal distributions have arisen as a possible alternative to power law distributions across many fields.},
  author    = {Mitzenmacher, M.},
  interhash = {50b0caa36c6cbc1ecfa0714157f06bd1},
  intrahash = {acdeb6b7980b25477665939c191f1e40},
  journal   = {Internet Mathematics},
  number    = 2,
  pages     = {226--251},
  title     = {A Brief History of Generative Models for Power Law and Lognormal Distributions},
  url       = {http://www.eecs.harvard.edu/~michaelm/CS223/powerlaw.pdf},
  volume    = 1,
  year      = 2004
}

% arXiv preprint; identifier taken from the url field.
@misc{newman2004power,
  abstract   = {When the probability of measuring a particular value of some quantity varies inversely as a power of that value, the quantity is said to follow a power law, also known variously as Zipf's law or the Pareto distribution. Power laws appear widely in physics, biology, earth and planetary sciences, economics and finance, computer science, demography and the social sciences. For instance, the distributions of the sizes of cities, earthquakes, solar flares, moon craters, wars and people's personal fortunes all appear to follow power laws. The origin of power-law behaviour has been a topic of debate in the scientific community for more than a century. Here we review some of the empirical evidence for the existence of power-law forms and the theories proposed to explain them.},
  author     = {Newman, M. E. J.},
  eprint     = {cond-mat/0412004},
  eprinttype = {arXiv},
  interhash  = {0e71ef0a12837211faa22d9f16eda4a8},
  intrahash  = {561772806731f6afcdc0c707e34662dd},
  title      = {Power laws, {Pareto} distributions and {Zipf's} law},
  url        = {http://arxiv.org/abs/cond-mat/0412004},
  year       = 2004
}

@book{partridge1998italian,
  asin      = {071904944X},
  author    = {Partridge, Hilary},
  dewey     = {320.94509045},
  ean       = {9780719049446},
  interhash = {6543bf6a87d51c6e6733b48c12f5feb8},
  intrahash = {e2cef45649dd05d13a4d9c9142a8e7d0},
  isbn      = {071904944X},
  publisher = {Manchester University Press},
  title     = {{Italian} Politics Today},
  url       = {http://www.amazon.com/Italian-Politics-Today-Hilary-Partridge/dp/071904944X},
  year      = 1998
}

% The note field is an annotation quoted by whoever recorded the entry.
@unpublished{hayes:ubt,
  abstract  = {The Web has experienced an exponential growth in the use of weblogs or blogs. Blog entries are generally organised using tags, informally defined labels which are increasingly being proposed as a 'grassroots' answer to Semantic Web standards. Despite this, tags have been shown to be weak at partitioning blog data. In this paper, we demonstrate how tags provide useful, discriminating information where the blog corpus is initially partitioned using a conventional clustering technique. Using extensive empirical evaluation we demonstrate how tag cloud information within each cluster allows us to identify potentially strong topic 'authorities' in each cluster. We conclude that tags have a key auxiliary role in refining and confirming the information produced using typical knowledge discovery techniques.},
  author    = {Hayes, C. and Avesani, P.},
  interhash = {77518d83a3d52d0a104150f9dd7b55ab},
  intrahash = {5d211ac9dd807b795e6f5b87e0488222},
  note      = {'tag based clustering may not be adapted for this kind of distribution [power law]'},
  title     = {Using Blog Tags To Identify Topic Authorities},
  url       = {https://apps.lis.uiuc.edu/wiki/display/LEADS/Fall+2007+Archive},
  year      = 2007
}

% The crossref target conf/pkdd/2009-1 is not defined in this part of the
% file; it must be available elsewhere (and, for classic BibTeX, after this entry).
% The doi is the one embedded in the non-standard ee field.
@inproceedings{conf/pkdd/AkogluF09,
  author    = {Akoglu, Leman and Faloutsos, Christos},
  booktitle = {ECML/PKDD (1)},
  crossref  = {conf/pkdd/2009-1},
  date      = {2009-08-31},
  doi       = {10.1007/978-3-642-04180-8_13},
  editor    = {Buntine, Wray L. and Grobelnik, Marko and Mladenic, Dunja and Shawe-Taylor, John},
  ee        = {http://dx.doi.org/10.1007/978-3-642-04180-8_13},
  interhash = {41cfe5a9af68deacdf3881536d5f1e0d},
  intrahash = {3af4a53fd0b650b6914f89a208bdc753},
  isbn      = {978-3-642-04179-2},
  pages     = {13--28},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  title     = {{RTG}: A Recursive Realistic Graph Generator Using Random Typing},
  url       = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2009-1.html#AkogluF09},
  volume    = 5781,
  year      = 2009
}

@book{zipf32selective,
  author    = {Zipf, G. K.},
  interhash = {3c608b74ec727229ac166703d763d336},
  intrahash = {09292a36e242c63a5327905d4a8779b1},
  publisher = {Harvard University Press},
  title     = {Selective Studies and the Principle of Relative Frequency in Language},
  year      = 1932
}

@article{li2005towards,
  author    = {Li, L. and Alderson, D. and Doyle, J. C. and Willinger, W.},
  interhash = {5f8381f5e7d915be9eb1682ac54d1c79},
  intrahash = {5d2a7bda356317c8d6a6af168128ade1},
  journal   = {Internet Mathematics},
  number    = 4,
  pages     = {431--523},
  publisher = {AK Peters},
  title     = {Towards a theory of scale-free graphs: Definition, properties, and implications},
  url       = {http://scholar.google.de/scholar.bib?q=info:Xi5NYPJyMvMJ:scholar.google.com/&output=citation&hl=de&as_sdt=2000&ct=citation&cd=0},
  volume    = 2,
  year      = 2005
}

% Second record of the arXiv preprint already stored as clauset2007powerlaw
% (identical interhash/intrahash). Kept because either key may be cited;
% prefer a single key in new documents.
@misc{Clauset2007,
  abstract   = {Power-law distributions occur in many situations of scientific interest and have significant consequences for our understanding of natural and man-made phenomena. Unfortunately, the detection and characterization of power laws is complicated by the large fluctuations that occur in the tail of the distribution -- the part of the distribution representing large but rare events -- and by the difficulty of identifying the range over which power-law behavior holds. Commonly used methods for analyzing power-law data, such as least-squares fitting, can produce substantially inaccurate estimates of parameters for power-law distributions, and even in cases where such methods return accurate answers they are still unsatisfactory because they give no indication of whether the data obey a power law at all. Here we present a principled statistical framework for discerning and quantifying power-law behavior in empirical data. Our approach combines maximum-likelihood fitting methods with goodness-of-fit tests based on the Kolmogorov-Smirnov statistic and likelihood ratios. We evaluate the effectiveness of the approach with tests on synthetic data and give critical comparisons to previous approaches. We also apply the proposed methods to twenty-four real-world data sets from a range of different disciplines, each of which has been conjectured to follow a power-law distribution. In some cases we find these conjectures to be consistent with the data while in others the power law is ruled out.},
  author     = {Clauset, Aaron and Shalizi, Cosma Rohilla and Newman, M. E. J.},
  eprint     = {0706.1062},
  eprinttype = {arXiv},
  interhash  = {2e3bc5bbd7449589e8bfb580e8936d4b},
  intrahash  = {7da1624e601898dd74df839ce2daeb24},
  note       = {cite arxiv:0706.1062 Comment: 43 pages, 11 figures, 7 tables, 4 appendices; code available at http://www.santafe.edu/~aaronc/powerlaws/},
  title      = {Power-law distributions in empirical data},
  url        = {http://arxiv.org/abs/0706.1062},
  year       = 2007
}

% doi is stored without the resolver prefix it originally carried.
@inproceedings{316229,
  abstract  = {Despite the apparent randomness of the Internet, we discover some surprisingly simple power-laws of the Internet topology. These power-laws hold for three snapshots of the Internet, between November 1997 and December 1998, despite a 45\% growth of its size during that period. We show that our power-laws fit the real data very well resulting in correlation coefficients of 96\% or higher. Our observations provide a novel perspective of the structure of the Internet. The power-laws describe concisely skewed distributions of graph properties such as the node outdegree. In addition, these power-laws can be used to estimate important parameters such as the average neighborhood size, and facilitate the design and the performance analysis of protocols. Furthermore, we can use them to generate and select realistic topologies for simulation purposes.},
  address   = {New York, NY, USA},
  author    = {Faloutsos, Michalis and Faloutsos, Petros and Faloutsos, Christos},
  booktitle = {SIGCOMM '99: Proceedings of the conference on Applications, technologies, architectures, and protocols for computer communication},
  doi       = {10.1145/316188.316229},
  interhash = {c857837ff6687f3e6ccfffde7f9fad86},
  intrahash = {5054ef1ebbe7d70f7eaf1d0431541b83},
  isbn      = {1-58113-135-6},
  location  = {Cambridge, Massachusetts, United States},
  pages     = {251--262},
  publisher = {ACM},
  title     = {On power-law relationships of the {Internet} topology},
  url       = {http://portal.acm.org/citation.cfm?id=316229},
  year      = 1999
}

@article{Goldstein04powerlawfitV1,
  abstract  = {Version 1 of Goldstein 04 power law fit containing also the chi 2 test},
  author    = {Goldstein, M. L. and Morris, S. A. and Yen, G. G.},
  interhash = {6216b964a64c9783e3bc22f46fa98a20},
  intrahash = {ce8d5ffe96977fd45bd01d677e9cc17d},
  journal   = {The European Physical Journal B - Condensed Matter and Complex Systems},
  number    = 2,
  pages     = {255--258},
  title     = {Fitting to the power-law distribution},
  url       = {http://arxiv.org/abs/cond-mat/0402322v1},
  volume    = 41,
  year      = 2004
}

@article{barabasi99emergence,
  author    = {Barabási, Albert-László and Albert, Réka},
  interhash = {a09e31f3baed5f4d2112ad182621f269},
  intrahash = {73f0eae7189f539854c5d59bae595942},
  journal   = {Science},
  pages     = {509--512},
  title     = {Emergence of scaling in random networks},
  volume    = 286,
  year      = 1999
}

% howpublished repeats the url so that styles without url support still
% print the address.
@misc{adamic02tutorial,
  author       = {Adamic, Lada},
  howpublished = {http://www.hpl.hp.com/research/idl/papers/ranking/ranking.html},
  interhash    = {3f519f2d8220a73cc688242352d96c08},
  intrahash    = {42678d6ad1776ec6134b251ff277deb1},
  title        = {{Zipf}, Power-laws, and {Pareto} -- a ranking tutorial},
  url          = {http://www.hpl.hp.com/research/idl/papers/ranking/ranking.html},
  year         = 2002
}

@article{adamic02zipf,
  author    = {Adamic, L. A. and Huberman, B. A.},
  interhash = {4c4730944613c749c81bbe6d30b456d0},
  intrahash = {179a38d414288ea6524ffdf0533ac9ab},
  journal   = {Glottometrics},
  pages     = {143--150},
  title     = {{Zipf's} Law and the {Internet}},
  volume    = 3,
  year      = 2002
}

% NOTE(review): pages holds a single page number, presumably the first page
% only; confirm the full range against the publisher record.
@article{newman03structure,
  author    = {Newman, M. E. J.},
  interhash = {7bedd01cb4c06af9f5200b0fb3faa571},
  intrahash = {72bc4532c466263fde0902faee84adff},
  journal   = {SIAM Review},
  pages     = 167,
  title     = {The structure and function of complex networks},
  url       = {http://www.citebase.org/abstract?id=oai:arXiv.org:cond-mat/0303516},
  volume    = 45,
  year      = 2003
}

% The DOI was previously stored as "doi:..." inside the url field, which is
% not a valid URL; it now lives in the doi field.
% NOTE(review): pages holds a single page number, presumably the first page
% only; confirm the full range against the publisher record.
@article{newman05power,
  author    = {Newman, M. E. J.},
  doi       = {10.1080/00107510500052444},
  interhash = {7539b701d6df3fb9a90b0ff70a32bfe9},
  intrahash = {436d9c707f94b26bbee4187fdf714820},
  journal   = {Contemporary Physics},
  pages     = 323,
  title     = {Power laws, {Pareto} distributions and {Zipf's} law},
  volume    = 46,
  year      = 2005
}