@comment{
  Conventions for the entries below: one field per line; page ranges use a
  double hyphen; months use the three-letter macros; accented characters are
  written as brace-wrapped LaTeX escapes so that classic BibTeX sorts and
  labels them correctly; percent signs inside abstracts are escaped.
}

@article{demvsar2006statistical,
  abstract = {While methods for comparing two learning algorithms on a single data set have been scrutinized for quite some time already, the issue of statistical tests for comparisons of more algorithms on multiple data sets, which is even more essential to typical machine learning studies, has been all but ignored. This article reviews the current practice and then theoretically and empirically examines several suitable tests. Based on that, we recommend a set of simple, yet safe and robust non-parametric tests for statistical comparisons of classifiers: the Wilcoxon signed ranks test for comparison of two classifiers and the Friedman test with the corresponding post-hoc tests for comparison of more classifiers over multiple data sets. Results of the latter can also be neatly presented with the newly introduced CD (critical difference) diagrams.},
  acmid = {1248548},
  author = {Dem{\v{s}}ar, Janez},
  interhash = {337f48d386c60bd13ce70021894680ef},
  intrahash = {93751bd0bfabffe38f799b9bb7f4c227},
  issn = {1532-4435},
  issue_date = {12/1/2006},
  journal = {Journal of Machine Learning Research},
  month = dec,
  numpages = {30},
  pages = {1--30},
  publisher = {JMLR.org},
  title = {Statistical Comparisons of Classifiers over Multiple Data Sets},
  url = {http://dl.acm.org/citation.cfm?id=1248547.1248548},
  volume = 7,
  year = 2006
}

@article{brody2006earlier,
  abstract = {The use of citation counts to assess the impact of research articles is well established. However, the citation impact of an article can only be measured several years after it has been published. As research articles are increasingly accessed through the Web, the number of times an article is downloaded can be instantly recorded and counted. One would expect the number of times an article is read to be related both to the number of times it is cited and to how old the article is. The authors analyze how short-term Web usage impact predicts medium-term citation impact. The physics e-print archive---arXiv.org---is used to test this.},
  author = {Brody, Tim and Harnad, Stevan and Carr, Leslie},
  doi = {10.1002/asi.20373},
  interhash = {b4ae997250ae110bcc89826cb2a8205c},
  intrahash = {643ec09ec9d1fd641c0416c3d8dde8f6},
  issn = {1532-2890},
  journal = {Journal of the American Society for Information Science and Technology},
  number = 8,
  pages = {1060--1072},
  publisher = {Wiley Subscription Services, Inc., A Wiley Company},
  title = {Earlier Web usage statistics as predictors of later citation impact},
  url = {http://dx.doi.org/10.1002/asi.20373},
  volume = 57,
  year = 2006
}

@inproceedings{saeed2008citation,
  abstract = {New developments in the collaborative and participatory role of Web has emerged new web based fast lane information systems like tagging and bookmarking applications. Same authors have shown elsewhere, that for same papers tags and bookmarks appear and gain volume very quickly in time as compared to citations and also hold good correlation with the citations. Studying the rank prediction models based on these systems gives advantage of gaining quick insight and localizing the highly productive and diffusible knowledge very early in time. This shows that it may be interesting to model the citation rank of a paper within the scope of a conference or journal issue, based on the bookmark counts (i-e count representing how many researchers have shown interest in a publication.) We used linear regression model for predicting citation ranks and compared both predicted citation rank models of bookmark counts and coauthor network counts for the papers of WWW06 conference. The results show that the rank prediction model based on bookmark counts is far better than the one based on coauthor network with mean absolute error for the first limited to the range of 5 and mean absolute error for second model above 18. Along with this we also compared the two bookmark prediction models out of which one was based on total citations rank as a dependent variable and the other was based on the adjusted citation rank. The citation rank was adjusted after subtracting the self and coauthor citations from total citations. The comparison reveals a significant improvement in the model and correlation after adjusting the citation rank. This may be interpreted that the bookmarking mechanisms represents the phenomenon similar to global discovery of a publication. While in the coauthor nets the papers are communicated personally and this communication or selection may not be captured within the bookmarking systems.},
  author = {Saeed, A. U. and Afzal, M. T. and Latif, A. and Tochtermann, K.},
  booktitle = {Multitopic Conference, 2008. INMIC 2008. IEEE International},
  doi = {10.1109/INMIC.2008.4777769},
  interhash = {26d1785cab132d577e377bb5bf299002},
  intrahash = {677fc89fef6c79a6a4f25cb25246e38a},
  month = dec,
  pages = {392--397},
  title = {Citation rank prediction based on bookmark counts: Exploratory case study of {WWW06} papers},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=4777769},
  year = 2008
}

@inproceedings{scholz2013people,
  author = {Scholz, Christoph and Atzmueller, Martin and Kibanov, Mark and Stumme, Gerd},
  booktitle = {ASONAM},
  interhash = {8b6051b794789000c4baa5ab059fab18},
  intrahash = {bf958ff2b11df1d9d15d9287ea07a5c9},
  title = {How Do People Link? Analysis of Contact Structures in Human Face-to-Face Proximity Networks},
  year = 2013
}

@inproceedings{christophscholzandmartinatzmuellerandalainbarratandcirocattutoandgerdstumme2013insights,
  address = {Palo Alto, CA, USA},
  author = {Scholz, Christoph and Atzmueller, Martin and Barrat, Alain and Cattuto, Ciro and Stumme, Gerd},
  booktitle = {Proc. 7th Intl. AAAI Conference on Weblogs and Social Media},
  interhash = {18e166abcc948c86d6d4e9de9b204760},
  intrahash = {3dce3522e260238f2a8e802b52096cb7},
  optpages = {To appear},
  publisher = {AAAI Press},
  title = {New Insights and Methods For Predicting Face-To-Face Contacts},
  year = 2013
}

@article{fu2010using,
  abstract = {The most popular method for judging the impact of biomedical articles is citation count which is the number of citations received. The most significant limitation of citation count is that it cannot evaluate articles at the time of publication since citations accumulate over time. This work presents computer models that accurately predict citation counts of biomedical publications within a deep horizon of 10 years using only predictive information available at publication time. Our experiments show that it is indeed feasible to accurately predict future citation counts with a mixture of content-based and bibliometric features using machine learning methods. The models pave the way for practical prediction of the long-term impact of publication, and their statistical analysis provides greater insight into citation behavior.},
  affiliation = {Center for Health Informatics and Bioinformatics, New York University Medical Center, 333 E. 38th St, 6th Floor, New York, NY 10016, USA},
  author = {Fu, Lawrence D. and Aliferis, Constantin F.},
  doi = {10.1007/s11192-010-0160-5},
  interhash = {5502184494caab8c56056b7a9d92cb15},
  intrahash = {e45088bdacbda5a5e8e6f293dcbca995},
  issn = {0138-9130},
  journal = {Scientometrics},
  keyword = {Computer Science},
  number = 1,
  pages = {257--270},
  publisher = {Akad{\'e}miai Kiad{\'o}, co-published with Springer Science+Business Media B.V., Formerly Kluwer Academic Publishers B.V.},
  title = {Using content-based and bibliometric features for machine learning models to predict citation counts in the biomedical literature},
  url = {http://dx.doi.org/10.1007/s11192-010-0160-5},
  volume = 85,
  year = 2010
}

@inproceedings{Yan:2012:BSS:2232817.2232831,
  abstract = {Usually scientists breed research ideas inspired by previous publications, but they are unlikely to follow all publications in the unbounded literature collection. The volume of literature keeps on expanding extremely fast, whilst not all papers contribute equal impact to the academic society. Being aware of potentially influential literature would put one in an advanced position in choosing important research references. Hence, estimation of potential influence is of great significance. We study a challenging problem of identifying potentially influential literature. We examine a set of hypotheses on what are the fundamental characteristics for highly cited papers and find some interesting patterns. Based on these observations, we learn to identify potentially influential literature via Future Influence Prediction (FIP), which aims to estimate the future influence of literature. The system takes a series of features of a particular publication as input and produces as output the estimated citation counts of that article after a given time period. We consider several regression models to formulate the learning process and evaluate their performance based on the coefficient of determination (R2). Experimental results on a real-large data set show a mean average predictive performance of 83.6\% measured in $R^2$. We apply the learned model to the application of bibliography recommendation and obtain prominent performance improvement in terms of Mean Average Precision (MAP).},
  acmid = {2232831},
  address = {New York, NY, USA},
  author = {Yan, Rui and Huang, Congrui and Tang, Jie and Zhang, Yan and Li, Xiaoming},
  booktitle = {Proceedings of the 12th ACM/IEEE-CS joint conference on Digital Libraries},
  doi = {10.1145/2232817.2232831},
  interhash = {85d10c6d37bcbfa057c51acc325a8116},
  intrahash = {9269d2dd9bf4bc8c0e7c668011fcfc1b},
  isbn = {978-1-4503-1154-0},
  location = {Washington, DC, USA},
  numpages = {10},
  pages = {51--60},
  publisher = {ACM},
  series = {JCDL '12},
  title = {To better stand on the shoulder of giants},
  url = {http://doi.acm.org/10.1145/2232817.2232831},
  year = 2012
}

@inproceedings{yan2011citation,
  abstract = {In most of the cases, scientists depend on previous literature which is relevant to their research fields for developing new ideas. However, it is not wise, nor possible, to track all existed publications because the volume of literature collection grows extremely fast. Therefore, researchers generally follow, or cite merely a small proportion of publications which they are interested in. For such a large collection, it is rather interesting to forecast which kind of literature is more likely to attract scientists' response. In this paper, we use the citations as a measurement for the popularity among researchers and study the interesting problem of Citation Count Prediction (CCP) to examine the characteristics for popularity. Estimation of possible popularity is of great significance and is quite challenging. We have utilized several features of fundamental characteristics for those papers that are highly cited and have predicted the popularity degree of each literature in the future. We have implemented a system which takes a series of features of a particular publication as input and produces as output the estimated citation counts of that article after a given time period. We consider several regression models to formulate the learning process and evaluate their performance based on the coefficient of determination (R-square). Experimental results on a real-large data set show that the best predictive model achieves a mean average predictive performance of 0.740 measured in R-square, which significantly outperforms several alternative algorithms.},
  acmid = {2063757},
  address = {New York, NY, USA},
  author = {Yan, Rui and Tang, Jie and Liu, Xiaobing and Shan, Dongdong and Li, Xiaoming},
  booktitle = {Proceedings of the 20th ACM international conference on Information and knowledge management},
  doi = {10.1145/2063576.2063757},
  interhash = {71ec0933a36df3dd21f38285bdf9b1b0},
  intrahash = {b0caabb6e17d9b790d3f13c897330aad},
  isbn = {978-1-4503-0717-8},
  location = {Glasgow, Scotland, UK},
  numpages = {6},
  pages = {1247--1252},
  publisher = {ACM},
  series = {CIKM '11},
  title = {Citation count prediction: learning to estimate future citations for literature},
  url = {http://doi.acm.org/10.1145/2063576.2063757},
  year = 2011
}

@inproceedings{SAS:12,
  address = {Boston, MA, USA},
  author = {Scholz, Christoph and Atzmueller, Martin and Stumme, Gerd},
  booktitle = {Proc. Fourth ASE/IEEE International Conference on Social Computing (SocialCom)},
  interhash = {9bc5d42018dbe8b926be214190258b3c},
  intrahash = {be5ae4b92170e7c595f5fdcac15b4786},
  publisher = {IEEE Computer Society},
  title = {On the Predictability of Human Contacts: Influence Factors and the Strength of Stronger Ties},
  url = {http://www.kde.cs.uni-kassel.de/atzmueller/paper/scholz-on-f2f-predictability-socialcom-2012.pdf},
  year = 2012
}

@article{shannon1951prediction,
  author = {Shannon, Claude Elwood},
  interhash = {daabc21c7f6e71f6e78a10c8d3492927},
  intrahash = {2e79cf0f6022645a632b13e081b0b035},
  journal = {Bell System Technical Journal},
  month = jan,
  pages = {50--64},
  title = {Prediction and Entropy of Printed {English}},
  url = {http://languagelog.ldc.upenn.edu/myl/Shannon1950.pdf},
  volume = 30,
  year = 1951
}

@inproceedings{milne2008learning,
  abstract = {This paper describes how to automatically cross-reference documents with Wikipedia: the largest knowledge base ever known. It explains how machine learning can be used to identify significant terms within unstructured text, and enrich it with links to the appropriate Wikipedia articles. The resulting link detector and disambiguator performs very well, with recall and precision of almost 75\%. This performance is constant whether the system is evaluated on Wikipedia articles or "real world" documents.

This work has implications far beyond enriching documents with explanatory links. It can provide structured knowledge about any unstructured fragment of text. Any task that is currently addressed with bags of words - indexing, clustering, retrieval, and summarization to name a few - could use the techniques described here to draw on a vast network of concepts and semantics.},
  address = {New York, NY, USA},
  author = {Milne, David and Witten, Ian H.},
  booktitle = {Proceedings of the 17th ACM conference on Information and knowledge management},
  doi = {10.1145/1458082.1458150},
  interhash = {44159e289485110212602792e72bbd74},
  intrahash = {fd9cd6bbf302731d5af3f6e748cdb359},
  isbn = {978-1-59593-991-3},
  location = {Napa Valley, California, USA},
  pages = {509--518},
  publisher = {ACM},
  title = {Learning to link with {Wikipedia}},
  url = {http://doi.acm.org/10.1145/1458082.1458150},
  year = 2008
}

@article{fu2008models,
  abstract = {The single most important bibliometric criterion for judging the impact of biomedical papers and their authors work is the number of citations received which is commonly referred to as citation count. This metric however is unavailable until several years after publication time. In the present work, we build computer models that accurately predict citation counts of biomedical publications within a deep horizon of ten years using only predictive information available at publication time. Our experiments show that it is indeed feasible to accurately predict future citation counts with a mixture of content-based and bibliometric features using machine learning methods. The models pave the way for practical prediction of the long-term impact of publication, and their statistical analysis provides greater insight into citation behavior.},
  author = {Fu, Lawrence D. and Aliferis, Constantin},
  interhash = {1eb972fa9ba9e255d6889b01532ea767},
  intrahash = {39d155a532108bc71437451e31287943},
  journal = {AMIA Annual Symposium Proceedings},
  pages = {222--226},
  pmid = {18999029},
  title = {Models for predicting and explaining citation count of biomedical articles},
  url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2656101/},
  year = 2008
}

@article{lokker2008prediction,
  author = {Lokker, Cynthia and McKibbon, K Ann and McKinlay, R James and Wilczynski, Nancy L and Haynes, R Brian},
  doi = {10.1136/bmj.39482.526713.BE},
  interhash = {f5f066ee09051d862c1a1c9f34a832c0},
  intrahash = {dece3577294846d48f198a6a5e6425c2},
  journal = {BMJ},
  month = mar,
  number = 7645,
  pages = {655--657},
  title = {Prediction of citation counts for clinical articles at two years using data available within three weeks of publication: retrospective cohort study},
  volume = 336,
  year = 2008
}

@article{hirsch2007index,
  abstract = {Bibliometric measures of individual scientific achievement are of particular interest if they can be used to predict future achievement. Here we report results of an empirical study of the predictive power of the h index compared with other indicators. Our findings indicate that the h index is better than other indicators considered (total citation count, citations per paper, and total paper count) in predicting future scientific achievement. We discuss reasons for the superiority of the h index.},
  author = {Hirsch, J. E.},
  doi = {10.1073/pnas.0707962104},
  eprint = {http://www.pnas.org/content/104/49/19193.full.pdf+html},
  interhash = {9bc6518ef60bb256ca78287a6c349f05},
  intrahash = {43caaad4f117fc3f5c14d83b9082448e},
  journal = {Proceedings of the National Academy of Sciences},
  number = 49,
  pages = {19193--19198},
  title = {Does the h index have predictive power?},
  url = {http://www.pnas.org/content/104/49/19193.abstract},
  volume = 104,
  year = 2007
}

@inproceedings{gaugaz2012predicting,
  abstract = {The amount of news content on the Web is increasing: Users can access news articles coming from a variety of sources on the Web: from newswires, news agencies, blogs, and at various places, e.g. even within Web search engines result pages. Anyhow, it still is a challenge for current search engines to decide which news events are worth being shown to the user (either for a newsworthy query or in a news portal). In this paper we define the task of predicting the future impact of news events. Being able to predict event impact will, for example, enable a newspaper to decide whether to follow a specific event or not, or a news search engine which stories to display. We define a flexible framework that, given some definition of impact, can predict its future development at the beginning of the event. We evaluate several possible definitions of event impact and experimentally identify the best features for each of them.},
  author = {Gaugaz, Julien and Siehndel, Patrick and Demartini, Gianluca and Iofciu, Tereza and Georgescu, Mihai and Henze, Nicola},
  booktitle = {Proc. of the 34th European Conference on Information Retrieval (ECIR 2012)},
  interhash = {dc898856b5a18bf1cb9307d1bd9b5268},
  intrahash = {f29c05f9a4fc3bb2189a965d95f622f9},
  location = {Barcelona, Spain},
  month = apr,
  title = {Predicting the Future Impact of News Events},
  url = {http://www.l3s.de/web/page25g.do?kcond12g.att1=1833},
  year = 2012
}

@comment{
  Song19022010 and song2010limits describe the same article (identical DOI).
  Both keys are kept so that existing citations of either one keep resolving;
  cite only one of them in any single document to avoid a doubled reference.
}

@article{Song19022010,
  abstract = {A range of applications, from predicting the spread of human and electronic viruses to city planning and resource management in mobile communications, depend on our ability to foresee the whereabouts and mobility of individuals, raising a fundamental question: To what degree is human behavior predictable? Here we explore the limits of predictability in human dynamics by studying the mobility patterns of anonymized mobile phone users. By measuring the entropy of each individual's trajectory, we find a 93\% potential predictability in user mobility across the whole user base. Despite the significant differences in the travel patterns, we find a remarkable lack of variability in predictability, which is largely independent of the distance users cover on a regular basis.},
  author = {Song, Chaoming and Qu, Zehui and Blumm, Nicholas and Barab{\'a}si, Albert-L{\'a}szl{\'o}},
  doi = {10.1126/science.1177170},
  eprint = {http://www.sciencemag.org/content/327/5968/1018.full.pdf},
  interhash = {f2611a08bf6db54f86e884c05f3cb5fb},
  intrahash = {a89330f8eb32ce62b5f5c9a2b4909f25},
  journal = {Science},
  number = 5968,
  pages = {1018--1021},
  title = {Limits of Predictability in Human Mobility},
  url = {http://www.sciencemag.org/content/327/5968/1018.abstract},
  volume = 327,
  year = 2010
}

@misc{asur2010predicting,
  abstract = {In recent years, social media has become ubiquitous and important for social networking and content sharing. And yet, the content that is generated from these websites remains largely untapped. In this paper, we demonstrate how social media content can be used to predict real-world outcomes. In particular, we use the chatter from Twitter.com to forecast box-office revenues for movies. We show that a simple model built from the rate at which tweets are created about particular topics can outperform market-based predictors. We further demonstrate how sentiments extracted from Twitter can be further utilized to improve the forecasting power of social media.},
  archiveprefix = {arXiv},
  author = {Asur, Sitaram and Huberman, Bernardo A.},
  eprint = {1003.5699},
  file = {asur2010predicting.pdf:asur2010predicting.pdf:PDF},
  groups = {public},
  interhash = {538607d6d5da7946a0c5a2114a7c44f5},
  intrahash = {9c23c0465529a60d9540ee29e74856f1},
  note = {arXiv:1003.5699},
  timestamp = {2010-11-09 10:12:57},
  title = {Predicting the Future with Social Media},
  url = {http://arxiv.org/abs/1003.5699},
  username = {dbenz},
  year = 2010
}

@article{song2010limits,
  abstract = {A range of applications, from predicting the spread of human and electronic viruses to city planning and resource management in mobile communications, depend on our ability to foresee the whereabouts and mobility of individuals, raising a fundamental question: To what degree is human behavior predictable? Here we explore the limits of predictability in human dynamics by studying the mobility patterns of anonymized mobile phone users. By measuring the entropy of each individual's trajectory, we find a 93\% potential predictability in user mobility across the whole user base. Despite the significant differences in the travel patterns, we find a remarkable lack of variability in predictability, which is largely independent of the distance users cover on a regular basis.},
  author = {Song, Chaoming and Qu, Zehui and Blumm, Nicholas and Barab{\'a}si, Albert-L{\'a}szl{\'o}},
  doi = {10.1126/science.1177170},
  eprint = {http://www.sciencemag.org/cgi/reprint/327/5968/1018.pdf},
  interhash = {f2611a08bf6db54f86e884c05f3cb5fb},
  intrahash = {a89330f8eb32ce62b5f5c9a2b4909f25},
  journal = {Science},
  number = 5968,
  pages = {1018--1021},
  title = {Limits of Predictability in Human Mobility},
  url = {http://www.barabasilab.com/pubs/CCNR-ALB_Publications/201002-19_Science-Predictability/201002-19_Science-Predictability.pdf},
  volume = 327,
  year = 2010
}

@article{1117458,
  abstract = {Event-based network data consists of sets of events over time, each of which may involve multiple entities. Examples include email traffic, telephone calls, and research publications (interpreted as co-authorship events). Traditional network analysis techniques, such as social network models, often aggregate the relational information from each event into a single static network. In contrast, in this paper we focus on the temporal nature of such data. In particular, we look at the problems of temporal link prediction and node ranking, and describe new methods that illustrate opportunities for data mining and machine learning techniques in this context. Experimental results are discussed for a large set of co-authorship events measured over multiple years, and a large corporate email data set spanning 21 months.},
  address = {New York, NY, USA},
  author = {O'Madadhain, Joshua and Hutchins, Jon and Smyth, Padhraic},
  doi = {10.1145/1117454.1117458},
  interhash = {97a718ab9fe24625f7389939d2608d31},
  intrahash = {89a23b31a476c4f3f771b5e3e4a8432c},
  issn = {1931-0145},
  journal = {SIGKDD Explorations Newsletter},
  number = 2,
  pages = {23--30},
  publisher = {ACM},
  title = {Prediction and ranking algorithms for event-based network data},
  url = {http://portal.acm.org/citation.cfm?id=1117458},
  volume = 7,
  year = 2005
}