% Bibliography: publications by Prabhakar Raghavan and co-authors.
%
% Layout: one entry per work, one field per line, brace-delimited values,
% fields in alphabetical order.
%
% Several works are intentionally present under more than one citation key
% (identical interhash values) so that documents citing either key keep
% resolving:
%   - manning2008 / manning2008introduction
%   - citeulike:688160 / magnani_visualtags_2006
%   - kumar99trawling / journals/cn/KumarRRT99
% Each key must still be defined exactly once; a repeated key is an error.

% Journal macros: keep a single spelling per journal.
@string{compnet = {Computer Networks}}
@string{vldbj   = {The VLDB Journal}}

@book{manning2008,
  author    = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},
  interhash = {2e574e46b7668a7268e7f02b46f4d9bb},
  intrahash = {9f4ab13e07b48b9723113aa74224be65},
  publisher = {Cambridge University Press},
  title     = {Introduction to Information Retrieval},
  year      = {2008},
}

@book{manning2008introduction,
  abstract  = {"Class-tested and coherent, this textbook teaches classical and web
               information retrieval, including web search and the related areas of
               text classification and text clustering from basic concepts. It gives
               an up-to-date treatment of all aspects of the design and
               implementation of systems for gathering, indexing, and searching
               documents; methods for evaluating systems; and an introduction to the
               use of machine learning methods on text collections. All the
               important ideas are explained using examples and figures, making it
               perfect for introductory courses in information retrieval for
               advanced undergraduates and graduate students in computer science.
               Based on feedback from extensive classroom experience, the book has
               been carefully structured in order to make teaching more natural and
               effective. Slides and additional exercises (with solutions for
               lecturers) are also available through the book's supporting website
               to help course instructors prepare their lectures." -- Publisher's
               description.},
  address   = {New York},
  author    = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},
  interhash = {2e574e46b7668a7268e7f02b46f4d9bb},
  intrahash = {9f4ab13e07b48b9723113aa74224be65},
  isbn      = {9780521865715 0521865719},
  publisher = {Cambridge University Press},
  title     = {Introduction to Information Retrieval},
  url       = {http://www.amazon.com/Introduction-Information-Retrieval-Christopher-Manning/dp/0521865719/ref=sr_1_1?ie=UTF8&qid=1337379279&sr=8-1},
  year      = {2008},
}

@incollection{baezayates2010generation,
  address     = {Berlin/Heidelberg},
  affiliation = {Yahoo! Research, Barcelona, Spain \& Sunnyvale, USA},
  author      = {Baeza-Yates, Ricardo and Raghavan, Prabhakar},
  booktitle   = {Search Computing},
  chapter     = {2},
  doi         = {10.1007/978-3-642-12310-8_2},
  editor      = {Ceri, Stefano and Brambilla, Marco},
  interhash   = {f52c55156cf6e3e51a3f752e61cc4c3b},
  intrahash   = {f709078bbb502164a951921df95feb4c},
  isbn        = {978-3-642-12309-2},
  keyword     = {Computer Science},
  pages       = {11--23},
  publisher   = {Springer},
  series      = {Lecture Notes in Computer Science},
  title       = {Next Generation Web Search},
  url         = {http://dx.doi.org/10.1007/978-3-642-12310-8_2},
  volume      = {5950},
  year        = {2010},
}

@article{Chakrabarti:1998:SFS:765529.765533,
  abstract  = {We explore how to organize large text databases hierarchically by
               topic to aid better searching, browsing and filtering. Many corpora,
               such as internet directories, digital libraries, and patent databases
               are manually organized into topic hierarchies, also called
               taxonomies. Similar to indices for relational data, taxonomies make
               search and access more efficient. However, the exponential growth in
               the volume of on-line textual information makes it nearly impossible
               to maintain such taxonomic organization for large, fast-changing
               corpora by hand. We describe an automatic system that starts with a
               small sample of the corpus in which topics have been assigned by
               hand, and then updates the database with new documents as the corpus
               grows, assigning topics to these new documents with high speed and
               accuracy. To do this, we use techniques from statistical pattern
               recognition to efficiently separate the feature words, or
               discriminants, from the noise words at each node of the taxonomy.
               Using these, we build a multilevel classifier. At each node, this
               classifier can ignore the large number of ``noise'' words in a
               document. Thus, the classifier has a small model size and is very
               fast. Owing to the use of context-sensitive features, the classifier
               is very accurate. As a by-product, we can compute for each document a
               set of terms that occur significantly more often in it than in the
               classes to which it belongs. We describe the design and
               implementation of our system, stressing how to exploit standard,
               efficient relational operations like sorts and joins. We report on
               experiences with the Reuters newswire benchmark, the US patent
               database, and web document samples from Yahoo!. We discuss
               applications where our system can improve searching and filtering
               capabilities.},
  acmid     = {765533},
  address   = {Secaucus, NJ, USA},
  author    = {Chakrabarti, Soumen and Dom, Byron and Agrawal, Rakesh and Raghavan, Prabhakar},
  doi       = {10.1007/s007780050061},
  interhash = {5969518d9723da79c33437322c06474a},
  intrahash = {606e518c01399c1e0569a00a81719343},
  issn      = {1066-8888},
  journal   = vldbj,
  month     = aug,
  number    = {3},
  numpages  = {16},
  pages     = {163--178},
  publisher = {Springer-Verlag New York, Inc.},
  title     = {Scalable feature selection, classification and signature generation for organizing large text databases into hierarchical topic taxonomies},
  url       = {http://dx.doi.org/10.1007/s007780050061},
  volume    = {7},
  year      = {1998},
}

@inproceedings{Gibson98clusteringcategorical,
  abstract  = {We describe a novel approach for clustering collections of sets, and
               its application to the analysis and mining of categorical data. By
               "categorical data," we mean tables with fields that cannot be
               naturally ordered by a metric --- e.g., the names of producers of
               automobiles, or the names of products offered by a manufacturer. Our
               approach is based on an iterative method for assigning and
               propagating weights on the categorical values in a table; this
               facilitates a type of similarity measure arising from the
               cooccurrence of values in the dataset. Our techniques can be studied
               analytically in terms of certain types of non-linear dynamical
               systems. We discuss experiments on a variety of tables of synthetic
               and real data; we find that our iterative methods converge quickly to
               prominently correlated values of various categorical fields. 1
               Introduction Much of the data in databases is categorical: fields in
               tables whose attributes cannot naturally be ordered as numerical
               values can. The pro...},
  author    = {Gibson, David and Kleinberg, Jon and Raghavan, Prabhakar},
  booktitle = {Proceedings of the 24th International Conference on Very Large Data Bases ({VLDB} '98)},
  interhash = {1439dc731dbc3225e455c4cd4ec297b1},
  intrahash = {31bcdc070e056e9ba33ba155ebc9285d},
  pages     = {311--322},
  title     = {Clustering Categorical Data: An Approach Based on Dynamical Systems},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.43.8003},
  year      = {1998},
}

@inproceedings{citeulike:688160,
  address             = {New York, NY, USA},
  author              = {Dubinko, Micah and Kumar, Ravi and Magnani, Joseph and Novak, Jasmine and Raghavan, Prabhakar and Tomkins, Andrew},
  booktitle           = {WWW '06: Proceedings of the 15th international conference on World Wide Web},
  citeulike-article-id = {688160},
  citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1135777.1135810},
  citeulike-linkout-1 = {http://dx.doi.org/10.1145/1135777.1135810},
  doi                 = {10.1145/1135777.1135810},
  interhash           = {b9ff2f72831a1406013a86c8202d6276},
  intrahash           = {cca8a679a78e2bced9a5cc268cfd3aaa},
  isbn                = {1595933239},
  pages               = {193--202},
  posted-at           = {2008-04-27 18:08:29},
  priority            = {5},
  publisher           = {ACM Press},
  title               = {Visualizing tags over time},
  url                 = {http://dx.doi.org/10.1145/1135777.1135810},
  year                = {2006},
}

@article{kumar99trawling,
  author    = {Kumar, Ravi and Raghavan, Prabhakar and Rajagopalan, Sridhar and Tomkins, Andrew},
  doi       = {10.1016/S1389-1286(99)00040-7},
  interhash = {18f10df4843e07c2bcec1cd916aa5da0},
  intrahash = {e1ce1b53c051efa1892dbc3b09643c31},
  journal   = compnet,
  number    = {11--16},
  pages     = {1481--1493},
  title     = {Trawling the {Web} for emerging cyber-communities},
  url       = {http://citeseer.ist.psu.edu/kumar99trawling.html},
  volume    = {31},
  year      = {1999},
}

@inproceedings{citeulike:108464,
  author               = {Kumar, Ravi and Novak, Jasmine and Raghavan, Prabhakar and Tomkins, Andrew},
  booktitle            = {WWW '03: Proceedings of the twelfth international conference on World Wide Web},
  citeulike-article-id = {108464},
  doi                  = {10.1145/775152.775233},
  interhash            = {dd46fb4bce63023cf588c809b8602358},
  intrahash            = {2dfec9129c883d3a89ce6617dea21673},
  isbn                 = {1581136803},
  pages                = {568--576},
  priority             = {2},
  publisher            = {ACM Press},
  title                = {On the bursty evolution of blogspace},
  url                  = {http://portal.acm.org/citation.cfm?id=775233},
  year                 = {2003},
}

@inproceedings{magnani_visualtags_2006,
  abstract  = {We consider the problem of visualizing the evolution of tags within
               the Flickr (flickr.com) online image sharing community. Any user of
               the Flickr service may append a tag to any photo in the system. Over
               the past year, users have on average added over a million tags each
               week. Understanding the evolution of these tags over time is
               therefore a challenging task. We present a new approach based on a
               characterization of the most interesting tags associated with a
               sliding interval of time. An animation provided via Flash in a web
               browser allows the user to observe and interact with the interesting
               tags as they evolve over time. New algorithms and data structures are
               required to support the efficient generation of this visualization.
               We combine a novel solution to an interval covering problem with
               extensions to previous work on score aggregation in order to create
               an efficient backend system capable of producing visualizations at
               arbitrary scales on this large dataset in real time.},
  address   = {New York, NY, USA},
  author    = {Dubinko, Micah and Kumar, Ravi and Magnani, Joseph and Novak, Jasmine and Raghavan, Prabhakar and Tomkins, Andrew},
  booktitle = {WWW '06: Proceedings of the 15th international conference on World Wide Web},
  doi       = {10.1145/1135777.1135810},
  interhash = {b9ff2f72831a1406013a86c8202d6276},
  intrahash = {cca8a679a78e2bced9a5cc268cfd3aaa},
  isbn      = {1-59593-323-9},
  location  = {Edinburgh, Scotland},
  pages     = {193--202},
  publisher = {ACM Press},
  title     = {Visualizing tags over time},
  url       = {http://portal.acm.org/citation.cfm?id=1135810},
  year      = {2006},
}

@inproceedings{conf/ht/GibsonKR98,
  author    = {Gibson, David and Kleinberg, Jon M. and Raghavan, Prabhakar},
  booktitle = {Hypertext},
  cdrom     = {HT1998/P225.pdf},
  ee        = {db/conf/ht/GibsonKR98.html},
  interhash = {47c85d35ba3293b0de52af32e824164b},
  intrahash = {bdc4ed454bc2dd7194de0f5f0b451203},
  pages     = {225--234},
  title     = {Inferring {Web} Communities from Link Topology},
  url       = {http://dblp.uni-trier.de/db/conf/ht/ht98.html#GibsonKR98},
  year      = {1998},
}

@article{gibson00clustering,
  author    = {Gibson, David and Kleinberg, Jon M. and Raghavan, Prabhakar},
  interhash = {8a2ea9a413538069328404c0bfe9c656},
  intrahash = {5205d8f52b2a8f22b63dd40bdd99746a},
  journal   = vldbj,
  number    = {3--4},
  pages     = {222--236},
  title     = {Clustering Categorical Data: An Approach Based on Dynamical Systems},
  url       = {http://citeseer.ist.psu.edu/cache/papers/cs/157/http:zSzzSzcs.cornell.eduzSzInfozSzPeoplezSzkleinberzSzvldb98.pdf/gibson98clustering.pdf},
  volume    = {8},
  year      = {2000},
}

@article{journals/cn/KumarRRT99,
  author    = {Kumar, Ravi and Raghavan, Prabhakar and Rajagopalan, Sridhar and Tomkins, Andrew},
  doi       = {10.1016/S1389-1286(99)00040-7},
  ee        = {http://dx.doi.org/10.1016/S1389-1286(99)00040-7},
  interhash = {18f10df4843e07c2bcec1cd916aa5da0},
  intrahash = {b356c4c84555d3b4d013b773ab867c8c},
  journal   = compnet,
  number    = {11--16},
  pages     = {1481--1493},
  title     = {Trawling the {Web} for Emerging Cyber-Communities},
  url       = {http://dblp.uni-trier.de/db/journals/cn/cn31.html#KumarRRT99},
  volume    = {31},
  year      = {1999},
}

@article{1035162,
  address   = {New York, NY, USA},
  author    = {Kumar, Ravi and Novak, Jasmine and Raghavan, Prabhakar and Tomkins, Andrew},
  interhash = {1ac484110e3594aadeb1225b0c6cf413},
  intrahash = {59276f12591314d721e8f408f8c341af},
  issn      = {0001-0782},
  journal   = {Communications of the ACM},
  number    = {12},
  pages     = {35--39},
  publisher = {ACM Press},
  title     = {Structure and evolution of blogspace},
  url       = {http://doi.acm.org/10.1145/1035134.1035162},
  volume    = {47},
  year      = {2004},
}