% Bibliography entries on Web crawling/archiving and on citation recommendation.
% Text outside entries is ignored by BibTeX, so these lines act as comments.
% Citation keys are kept exactly as they were so existing \cite commands
% continue to resolve. Words wrapped in braces inside titles are protected
% from the sentence-casing that many bibliography styles apply.

% Journal article: WebBase crawler/repository system (ACM TOIT 6(2), 2006).
@article{cho2006stanford,
  author     = {Cho, Junghoo and Garcia-Molina, Hector and Haveliwala, Taher and Lam, Wang and Paepcke, Andreas and Raghavan, Sriram and Wesley, Gary},
  title      = {{Stanford} {WebBase} components and applications},
  journal    = {ACM Transactions on Internet Technology},
  volume     = {6},
  number     = {2},
  pages      = {153--186},
  numpages   = {34},
  month      = may,
  year       = {2006},
  issue_date = {May 2006},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {1533-5399},
  doi        = {10.1145/1149121.1149124},
  url        = {http://doi.acm.org/10.1145/1149121.1149124},
  acmid      = {1149124},
  interhash  = {bebbc072ea2dccf4c2b27abf244c1f08},
  intrahash  = {3cd21bf8a87619e0489b8da177c9f0b4},
  abstract   = {We describe the design and performance of WebBase, a tool for Web research. The system includes a highly customizable crawler, a repository for collected Web pages, an indexer for both text and link-related page features, and a high-speed content distribution facility. The distribution module enables researchers world-wide to retrieve pages from WebBase, and stream them across the Internet at high speed. The advantage for the researchers is that they need not all crawl the Web before beginning their research. WebBase has been used by scores of research and teaching organizations world-wide, mostly for investigations into Web topology and linguistic content analysis. After describing the system's architecture, we explain our engineering decisions for each of the WebBase components, and present respective performance measurements.},
}

% Journal article: researchers' use of the BnF web archives (D-Lib 18(3/4), 2012).
% The issue spans two months, so the month is built from the standard
% three-letter macros joined with "/" instead of a hard-coded string.
@article{stirling2012archives,
  author     = {Stirling, Peter and Chevallier, Philippe and Illien, Gildas},
  title      = {Web Archives for Researchers: Representations, Expectations and Potential Uses},
  journal    = {D-Lib Magazine},
  volume     = {18},
  number     = {3/4},
  month      = mar # "/" # apr,
  year       = {2012},
  issn       = {1082-9873},
  doi        = {10.1045/march2012-stirling},
  url        = {http://www.dlib.org/dlib/march12/stirling/03stirling.html},
  interhash  = {a783191c99a285197525595ebf509bb2},
  intrahash  = {4f7840193e7e435ad5dd0003fc93691a},
  abstract   = {The Internet has been covered by legal deposit legislation in France since 2006, making web archiving one of the missions of the Bibliothèque nationale de France (BnF). Access to the web archives has been provided in the library on an experimental basis since 2008. In the context of increasing interest in many countries in web archiving and how it may best serve the needs of researchers, especially in the expanding field of Internet studies for social sciences, a qualitative study was performed, based on interviews with potential users of the web archives held at the BnF, and particularly researchers working in various areas related to the Internet. The study aimed to explore their needs in terms of both content and services, and also to analyse different ways of representing the archives, in order to identify ways of increasing their use. While the interest of maintaining the "memory" of the web is obvious to the researchers, they are faced with the difficulty of defining, in what is a seemingly limitless space, meaningful collections of documents. Cultural heritage institutions such as national libraries are perceived as trusted third parties capable of creating rationally-constructed and well-documented collections, but such archives raise certain ethical and methodological questions.},
}

% Conference paper: Claper, a recommender of classical papers (FSKD 2010).
@inproceedings{wang2010claper,
  author     = {Wang, Yonggang and Zhai, Ennan and Hu, Jianbin and Chen, Zhong},
  title      = {{Claper}: Recommend classical papers to beginners},
  booktitle  = {Proceedings of the seventh International Conference on Fuzzy Systems and Knowledge Discovery},
  volume     = {6},
  pages      = {2777--2781},
  month      = aug,
  year       = {2010},
  publisher  = {IEEE},
  doi        = {10.1109/FSKD.2010.5569227},
  url        = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5569227},
  interhash  = {7180ddaf1c1765a45fd244027bd0bf43},
  intrahash  = {7da72bf2f0538afad9377a0d50c263b4},
  abstract   = {Classical papers are of great help for beginners to get familiar with a new research area. However, digging them out is a difficult problem. This paper proposes Claper, a novel academic recommendation system based on two proven principles: the Principle of Download Persistence and the Principle of Citation Approaching (we prove them based on real-world datasets). The principle of download persistence indicates that classical papers have few decreasing download frequencies since they were published. The principle of citation approaching indicates that a paper which cites a classical paper is likely to cite citations of that classical paper. Our experimental results based on large-scale real-world datasets illustrate Claper can effectively recommend classical papers of high quality to beginners and thus help them enter their research areas.},
}

% Conference paper: citation recommendation from a bare manuscript (WSDM 2011).
@inproceedings{he2011citation,
  author     = {He, Qi and Kifer, Daniel and Pei, Jian and Mitra, Prasenjit and Giles, C. Lee},
  title      = {Citation recommendation without author supervision},
  booktitle  = {Proceedings of the fourth ACM international conference on Web search and data mining},
  pages      = {755--764},
  numpages   = {10},
  year       = {2011},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  location   = {Hong Kong, China},
  isbn       = {978-1-4503-0493-1},
  doi        = {10.1145/1935826.1935926},
  url        = {http://doi.acm.org/10.1145/1935826.1935926},
  acmid      = {1935926},
  interhash  = {7e98aaf26a7ed6cc624249a3ab570d7a},
  intrahash  = {bbd320f03d13c6cfff4b6f9e6b4630f7},
  abstract   = {Automatic recommendation of citations for a manuscript is highly valuable for scholarly activities since it can substantially improve the efficiency and quality of literature search. The prior techniques placed a considerable burden on users, who were required to provide a representative bibliography or to mark passages where citations are needed. In this paper we present a system that considerably reduces this burden: a user simply inputs a query manuscript (without a bibliography) and our system automatically finds locations where citations are needed. We show that naïve approaches do not work well due to massive noise in the document corpus. We produce a successful approach by carefully examining the relevance between segments in a query manuscript and the representative segments extracted from a document corpus. An extensive empirical evaluation using the CiteSeerX data set shows that our approach is effective.},
}

% Conference paper: learned literature-search model (CIKM 2010).
% The pronoun "I" in the title is braced so it is not lowercased.
@inproceedings{bethard2010should,
  author     = {Bethard, Steven and Jurafsky, Dan},
  title      = {Who should {I} cite: learning literature search models from citation behavior},
  booktitle  = {Proceedings of the 19th ACM international conference on Information and knowledge management},
  pages      = {609--618},
  numpages   = {10},
  year       = {2010},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  location   = {Toronto, ON, Canada},
  isbn       = {978-1-4503-0099-5},
  doi        = {10.1145/1871437.1871517},
  url        = {http://doi.acm.org/10.1145/1871437.1871517},
  acmid      = {1871517},
  interhash  = {1cdf6c7da38af251279e9fb915266af2},
  intrahash  = {369206c7472baeaa5ecefef586e16c6a},
  abstract   = {Scientists depend on literature search to find prior work that is relevant to their research ideas. We introduce a retrieval model for literature search that incorporates a wide variety of factors important to researchers, and learns the weights of each of these factors by observing citation patterns. We introduce features like topical similarity and author behavioral patterns, and combine these with features from related work like citation count and recency of publication. We present an iterative process for learning weights for these features that alternates between retrieving articles with the current retrieval model, and updating model weights by training a supervised classifier on these articles. We propose a new task for evaluating the resulting retrieval models, where the retrieval system takes only an abstract as its input and must produce as output the list of references at the end of the abstract's article. We evaluate our model on a collection of journal, conference and workshop articles from the ACL Anthology Reference Corpus. Our model achieves a mean average precision of 28.7, a 12.8 point improvement over a term similarity baseline, and a significant improvement both over models using only features from related work and over models without our iterative learning.},
}

% Conference paper: manuscript-as-query citation recommendation (SIGIR 2007).
% NOTE(review): this key follows a different scheme from the other entries;
% it is left untouched because renaming it would break existing citations.
@inproceedings{Strohman:2007:RCA:1277741.1277868,
  author     = {Strohman, Trevor and Croft, W. Bruce and Jensen, David},
  title      = {Recommending citations for academic papers},
  booktitle  = {Proceedings of the 30th annual international ACM SIGIR conference on Research and development in information retrieval},
  pages      = {705--706},
  numpages   = {2},
  year       = {2007},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  location   = {Amsterdam, The Netherlands},
  isbn       = {978-1-59593-597-7},
  doi        = {10.1145/1277741.1277868},
  url        = {http://doi.acm.org/10.1145/1277741.1277868},
  acmid      = {1277868},
  interhash  = {a34279add7d7a9f3c564735b7b8dcd44},
  intrahash  = {7a0b1ff2a40b3989ef8d83daabd91159},
  abstract   = {We approach the problem of academic literature search by considering an unpublished manuscript as a query to a search system. We use the text of previous literature as well as the citation graph that connects it to find relevant related material. We evaluate our technique with manual and automatic evaluation methods, and find an order of magnitude improvement in mean average precision as compared to a text similarity baseline.},
}