% Conventions used in this file:
%   - one field per line, citation keys are unique;
%   - doi holds the bare DOI (no resolver prefix); resolver links live in url;
%   - month uses the standard three-letter macros (jan ... dec), unquoted;
%   - words in titles that must keep their capitalisation are brace-protected;
%   - interhash / intrahash / misc are non-standard fields ignored by standard styles.

@article{khabsa2014number,
  abstract  = {The number of scholarly documents available on the web is estimated using capture/recapture methods by studying the coverage of two major academic search engines: Google Scholar and Microsoft Academic Search. Our estimates show that at least 114 million English-language scholarly documents are accessible on the web, of which Google Scholar has nearly 100 million. Of these, we estimate that at least 27 million (24%) are freely available since they do not require a subscription or payment of any kind. In addition, at a finer scale, we also estimate the number of scholarly documents on the web for fifteen fields: Agricultural Science, Arts and Humanities, Biology, Chemistry, Computer Science, Economics and Business, Engineering, Environmental Sciences, Geosciences, Material Science, Mathematics, Medicine, Physics, Social Sciences, and Multidisciplinary, as defined by Microsoft Academic Search. In addition, we show that among these fields the percentage of documents defined as freely available varies significantly, i.e., from 12 to 50%.},
  author    = {Khabsa, Madian and Giles, C. Lee},
  interhash = {61aed8da8eb53c7583d1f27e3cd8fa0c},
  intrahash = {8fa9f00fb097a3fd6d0390152c848a37},
  journal   = {PLoS ONE},
  month     = may,
  number    = 5,
  pages     = {e93949},
  title     = {The number of scholarly documents on the public web},
  url       = {http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0093949},
  volume    = 9,
  year      = 2014
}

% Conference paper: the venue is given in booktitle, so the entry type is inproceedings.
@inproceedings{kataria2011context,
  abstract  = {In a document network such as a citation network of scientific documents, web-logs etc., the content produced by authors exhibit their interest in certain topics. In addition some authors influence other authors' interests. In this work, we propose to model the influence of cited authors along with the interests of citing authors. Moreover, we hypothesize that citations present in documents, the context surrounding the citation mention provides extra topical information about the cited authors. However, associating terms in the context to the cited authors remains an open problem. We propose novel document generation schemes that incorporate the context while simultaneously modeling the interests of citing authors and influence of the cited authors. Our experiments show significant improvements over baseline models for various evaluation criteria such as link prediction between document and cited author, and quantitatively explaining unseen text.},
  author    = {Kataria, Saurabh and Mitra, Prasenjit and Caragea, Cornelia and Giles, C. Lee},
  booktitle = {International Joint Conference on Artificial Intelligence},
  interhash = {7496b4df1335fbc6aea691cecb65289d},
  intrahash = {dc774d17ec721be6d32530d265f34539},
  title     = {Context Sensitive Topic Models for Author Influence in Document Networks},
  url       = {https://www.aaai.org/ocs/index.php/IJCAI/IJCAI11/paper/view/3140},
  year      = 2011
}

@inproceedings{he2011citation,
  abstract  = {Automatic recommendation of citations for a manuscript is highly valuable for scholarly activities since it can substantially improve the efficiency and quality of literature search. The prior techniques placed a considerable burden on users, who were required to provide a representative bibliography or to mark passages where citations are needed. In this paper we present a system that considerably reduces this burden: a user simply inputs a query manuscript (without a bibliography) and our system automatically finds locations where citations are needed. We show that naïve approaches do not work well due to massive noise in the document corpus. We produce a successful approach by carefully examining the relevance between segments in a query manuscript and the representative segments extracted from a document corpus. An extensive empirical evaluation using the CiteSeerX data set shows that our approach is effective.},
  acmid     = {1935926},
  address   = {New York, NY, USA},
  author    = {He, Qi and Kifer, Daniel and Pei, Jian and Mitra, Prasenjit and Giles, C. Lee},
  booktitle = {Proceedings of the fourth ACM international conference on Web search and data mining},
  doi       = {10.1145/1935826.1935926},
  interhash = {7e98aaf26a7ed6cc624249a3ab570d7a},
  intrahash = {bbd320f03d13c6cfff4b6f9e6b4630f7},
  isbn      = {978-1-4503-0493-1},
  location  = {Hong Kong, China},
  numpages  = {10},
  pages     = {755--764},
  publisher = {ACM},
  title     = {Citation recommendation without author supervision},
  url       = {http://doi.acm.org/10.1145/1935826.1935926},
  year      = 2011
}

@inproceedings{lawrence1999indexing,
  abstract  = {The web has greatly improved access to scientific literature. However, scientific articles on the web are largely disorganized, with research articles being spread across archive sites, institution sites, journal sites, and researcher homepages. No index covers all of the available literature, and the major web search engines typically do not index the content of Postscript/PDF documents at all. This paper discusses the creation of digital libraries of scientific literature on the web, including the efficient location of articles, full-text indexing of the articles, autonomous citation indexing, information extraction, display of query-sensitive summaries and citation context, hubs and authorities computation, similar document detection, user profiling, distributed error correction, graph analysis, and detection of overlapping documents. The software for the system is available at no cost for non-commercial use.},
  acmid     = {319970},
  address   = {New York, NY, USA},
  author    = {Lawrence, Steve and Bollacker, Kurt and Giles, C. Lee},
  booktitle = {Proceedings of the eighth international conference on Information and knowledge management},
  doi       = {10.1145/319950.319970},
  interhash = {09c20b905496b3fba782688018d948b0},
  intrahash = {8f79ea9ca0db12c8bf853dcceed20eb3},
  isbn      = {1-58113-146-1},
  location  = {Kansas City, Missouri, United States},
  numpages  = {8},
  pages     = {139--146},
  publisher = {ACM},
  title     = {Indexing and retrieval of scientific literature},
  url       = {http://doi.acm.org/10.1145/319950.319970},
  year      = 1999
}

@article{song2011automatic,
  abstract  = {The emergence of Web 2.0 and the consequent success of social network Web sites such as Del.icio.us and Flickr introduce us to a new concept called social bookmarking, or tagging. Tagging is the action of connecting a relevant user-defined keyword to a document, image, or video, which helps the user to better organize and share their collections of interesting stuff. With the rapid growth of Web 2.0, tagged data is becoming more and more abundant on the social network Web sites. An interesting problem is how to automate the process of making tag recommendations to users when a new resource becomes available.

In this article, we address the issue of tag recommendation from a machine learning perspective. From our empirical observation of two large-scale datasets, we first argue that the user-centered approach for tag recommendation is not very effective in practice. Consequently, we propose two novel document-centered approaches that are capable of making effective and efficient tag recommendations in real scenarios. The first, graph-based, method represents the tagged data in two bipartite graphs, (document, tag) and (document, word), then finds document topics by leveraging graph partitioning algorithms. The second, prototype-based, method aims at finding the most representative documents within the data collections and advocates a sparse multiclass Gaussian process classifier for efficient document classification. For both methods, tags are ranked within each topic cluster/class by a novel ranking method. Recommendations are performed by first classifying a new document into one or more topic clusters/classes, and then selecting the most relevant tags from those clusters/classes as machine-recommended tags.

Experiments on real-world data from Del.icio.us, CiteULike, and BibSonomy examine the quality of tag recommendation as well as the efficiency of our recommendation algorithms. The results suggest that our document-centered models can substantially improve the performance of tag recommendations when compared to the user-centered methods, as well as topic models LDA and SVM classifiers.},
  address   = {New York, NY, USA},
  author    = {Song, Yang and Zhang, Lu and Giles, C. Lee},
  doi       = {10.1145/1921591.1921595},
  interhash = {6e93d08c935eaf887ed46750f405e742},
  intrahash = {8e5fdf385f7bae639ca978259d9ec8de},
  issn      = {1559-1131},
  journal   = {Transactions on the Web},
  month     = feb,
  number    = 1,
  pages     = {1--31},
  publisher = {ACM},
  title     = {Automatic tag recommendation algorithms for social recommender systems},
  url       = {http://doi.acm.org/10.1145/1921591.1921595},
  volume    = 5,
  year      = 2011
}

@inproceedings{347121,
  address   = {New York, NY, USA},
  author    = {Flake, Gary William and Lawrence, Steve and Giles, C. Lee},
  booktitle = {KDD '00: Proceedings of the sixth ACM SIGKDD international conference on Knowledge discovery and data mining},
  doi       = {10.1145/347090.347121},
  interhash = {e74be2040258b24f3b2e03466931a9da},
  intrahash = {b37bffe4a02dace7c303d663fd24182c},
  isbn      = {1-58113-233-6},
  location  = {Boston, Massachusetts, United States},
  pages     = {150--160},
  publisher = {ACM Press},
  title     = {Efficient identification of {Web} communities},
  year      = 2000
}

% This key must occur only once in the database; BibTeX rejects repeated entries.
@article{pennock2002winners,
  author    = {Pennock, David and Flake, Gary and Lawrence, Steve and Glover, Eric and Giles, C. Lee},
  interhash = {1a0fa8a805c65f5a4096627c1e019da4},
  intrahash = {10554994432471894ca93bd8a0493e17},
  journal   = {Proc.\ National Academy of Sciences},
  misc      = {comment = {Lokal vorhanden; PLOD-Algorithmus -> Faloutsos}},
  month     = apr,
  number    = 8,
  pages     = {5207--5211},
  title     = {Winners don't take all: Characterizing the competition for links on the web},
  volume    = 99,
  year      = 2002
}

@article{1304546,
  abstract  = {Social bookmarking services have recently gained popularity among Web users. Whereas numerous studies provide a historical account of tagging systems, the authors use their analysis of a domain-specific social bookmarking service called CiteULike to reflect on two metrics for evaluating tagging behavior: tag growth and tag reuse. They examine the relationship between these two metrics and articulate design implications for enhancing social bookmarking services. The authors also briefly reflect on their own work on developing a social bookmarking service for CiteSeer, an online scholarly digital library for computer science.},
  address   = {Piscataway, NJ, USA},
  author    = {Farooq, Umer and Song, Yang and Carroll, John M. and Giles, C. Lee},
  doi       = {10.1109/MIC.2007.135},
  interhash = {13183e8fc4cbe0944a819afa2d9ff4eb},
  intrahash = {5785e8a8064b3d346f8c198c3c860bf6},
  issn      = {1089-7801},
  journal   = {IEEE Internet Computing},
  number    = 6,
  pages     = {29--35},
  publisher = {IEEE Educational Activities Department},
  title     = {Social Bookmarking for Scholarly Digital Libraries},
  url       = {http://portal.acm.org/citation.cfm?id=1304546&coll=Portal&dl=GUIDE&CFID=46454031&CFTOKEN=27530397},
  volume    = 11,
  year      = 2007
}

@inproceedings{1390423,
  address   = {New York, NY, USA},
  author    = {Song, Yang and Zhuang, Ziming and Li, Huajing and Zhao, Qiankun and Li, Jia and Lee, Wang-Chien and Giles, C. Lee},
  booktitle = {SIGIR '08: Proceedings of the 31st annual international ACM SIGIR conference on Research and development in information retrieval},
  doi       = {10.1145/1390334.1390423},
  interhash = {e6505664e875de06d98a6e787d4367d1},
  intrahash = {525a37f6ef3d81a81686b515a148b88b},
  isbn      = {978-1-60558-164-4},
  location  = {Singapore, Singapore},
  pages     = {515--522},
  publisher = {ACM},
  title     = {Real-time automatic tag recommendation},
  url       = {http://portal.acm.org/citation.cfm?id=1390334.1390423},
  year      = 2008
}

@inproceedings{1458098,
  address   = {New York, NY, USA},
  author    = {Song, Yang and Zhang, Lu and Giles, C. Lee},
  booktitle = {CIKM '08: Proceeding of the 17th ACM conference on Information and knowledge management},
  doi       = {10.1145/1458082.1458098},
  interhash = {5c03bc1e658b6d44f053944418bdaec3},
  intrahash = {d330a3537b4a14fbd40661424ec8e465},
  isbn      = {978-1-59593-991-3},
  location  = {Napa Valley, California, USA},
  pages     = {93--102},
  publisher = {ACM},
  title     = {A sparse {Gaussian} processes classification framework for fast tag suggestions},
  url       = {http://portal.acm.org/citation.cfm?id=1458098},
  year      = 2008
}