% Bibliography: social tagging, topic models and semantic relatedness
% (Ramage, Heymann, Manning et al., 2008--2009).
%
% Conventions used in this file:
%   * one field per line, fields kept in alphabetical order;
%   * doi holds the bare DOI (no resolver prefix);
%   * words in a title that must keep their capitalisation are wrapped in
%     braces so that sentence-casing styles do not lowercase them;
%   * interhash, intrahash, acmid, numpages, username, groups, timestamp and
%     file are bookkeeping fields from the exporting tools; standard styles
%     ignore them;
%   * every citation key appears exactly once.

@inproceedings{Ramage:2009:LLS:1699510.1699543,
  abstract  = {A significant portion of the world's text is tagged by readers on social bookmarking websites. Credit attribution is an inherent problem in these corpora because most pages have multiple tags, but the tags do not always apply with equal specificity across the whole document. Solving the credit attribution problem requires associating each word in a document with the most appropriate tags and vice versa. This paper introduces Labeled LDA, a topic model that constrains Latent Dirichlet Allocation by defining a one-to-one correspondence between LDA's latent topics and user tags. This allows Labeled LDA to directly learn word-tag correspondences. We demonstrate Labeled LDA's improved expressiveness over traditional LDA with visualizations of a corpus of tagged web pages from del.icio.us. Labeled LDA outperforms SVMs by more than 3 to 1 when extracting tag-specific document snippets. As a multi-label text classifier, our model is competitive with a discriminative baseline on a variety of datasets.},
  acmid     = {1699543},
  address   = {Stroudsburg, PA, USA},
  author    = {Ramage, Daniel and Hall, David and Nallapati, Ramesh and Manning, Christopher D.},
  booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 1},
  interhash = {45315f4da7b10debdca560506cf0d7ba},
  intrahash = {6e7173f084e26bca9a8d2a1ab4a5b709},
  isbn      = {978-1-932432-59-6},
  location  = {Singapore},
  numpages  = {9},
  pages     = {248--256},
  publisher = {Association for Computational Linguistics},
  series    = {EMNLP '09},
  title     = {Labeled {LDA}: A Supervised Topic Model for Credit Attribution in Multi-labeled Corpora},
  url       = {http://dl.acm.org/citation.cfm?id=1699510.1699543},
  year      = 2009
}

@inproceedings{Yeh:2009:WRW:1708124.1708133,
  abstract  = {Computing semantic relatedness of natural language texts is a key component of tasks such as information retrieval and summarization, and often depends on knowledge of a broad range of real-world concepts and relationships. We address this knowledge integration issue by computing semantic relatedness using personalized PageRank (random walks) on a graph derived from Wikipedia. This paper evaluates methods for building the graph, including link selection strategies, and two methods for representing input texts as distributions over the graph nodes: one based on a dictionary lookup, the other based on Explicit Semantic Analysis. We evaluate our techniques on standard word relatedness and text similarity datasets, finding that they capture similarity information complementary to existing Wikipedia-based relatedness measures, resulting in small improvements on a state-of-the-art measure.},
  acmid     = {1708133},
  address   = {Stroudsburg, PA, USA},
  author    = {Yeh, Eric and Ramage, Daniel and Manning, Christopher D. and Agirre, Eneko and Soroa, Aitor},
  booktitle = {Proceedings of the 2009 Workshop on Graph-based Methods for Natural Language Processing},
  interhash = {8b28cd800b6ad3929eef3b45de997e51},
  intrahash = {ffd20a7357ca8e87d46e516589a7769e},
  isbn      = {978-1-932432-54-1},
  location  = {Suntec, Singapore},
  numpages  = {9},
  pages     = {41--49},
  publisher = {Association for Computational Linguistics},
  series    = {TextGraphs-4},
  title     = {{WikiWalk}: random walks on {Wikipedia} for semantic relatedness},
  url       = {http://dl.acm.org/citation.cfm?id=1708124.1708133},
  year      = 2009
}

@inproceedings{ramage2009clustering,
  abstract  = {Automatically clustering web pages into semantic groups promises improved search and browsing on the web. In this paper, we demonstrate how user-generated tags from largescale social bookmarking websites such as del.icio.us can be used as a complementary data source to page text and anchor text for improving automatic clustering of web pages. This paper explores the use of tags in 1) K-means clustering in an extended vector space model that includes tags as well as page text and 2) a novel generative clustering algorithm based on latent Dirichlet allocation that jointly models text and tags. We evaluate the models by comparing their output to an established web directory. We find that the naive inclusion of tagging data improves cluster quality versus page text alone, but a more principled inclusion can substantially improve the quality of all models with a statistically significant absolute F-score increase of 4\%. The generative model outperforms K-means with another 8\% F-score increase.},
  address   = {New York, NY, USA},
  author    = {Ramage, Daniel and Heymann, Paul and Manning, Christopher D. and Garcia-Molina, Hector},
  booktitle = {WSDM '09: Proceedings of the Second ACM International Conference on Web Search and Data Mining},
  doi       = {10.1145/1498759.1498809},
  file      = {ramage2009clustering.pdf:ramage2009clustering.pdf:PDF},
  groups    = {public},
  interhash = {5595f06f88310ed67fd6fe23f813c69b},
  intrahash = {75c4bad29d7eb4b34f68da27f0353516},
  isbn      = {978-1-60558-390-7},
  location  = {Barcelona, Spain},
  pages     = {54--63},
  publisher = {ACM},
  timestamp = {2009-04-24 10:19:45},
  title     = {Clustering the tagged web},
  url       = {http://portal.acm.org/citation.cfm?id=1498809},
  username  = {dbenz},
  year      = 2009
}

@inproceedings{heymann2008social,
  abstract  = {In this paper, we look at the "social tag prediction" problem. Given a set of objects, and a set of tags applied to those objects by users, can we predict whether a given tag could/should be applied to a particular object? We investigated this question using one of the largest crawls of the social bookmarking system del.icio.us gathered to date. For URLs in del.icio.us, we predicted tags based on page text, anchor text, surrounding hosts, and other tags applied to the URL. We found an entropy-based metric which captures the generality of a particular tag and informs an analysis of how well that tag can be predicted. We also found that tag-based association rules can produce very high-precision predictions as well as giving deeper understanding into the relationships between tags. Our results have implications for both the study of tagging systems as potential information retrieval tools, and for the design of such systems.},
  address   = {New York, NY, USA},
  author    = {Heymann, Paul and Ramage, Daniel and Garcia-Molina, Hector},
  booktitle = {SIGIR '08: Proceedings of the 31st Annual International ACM SIGIR Conference on Research and Development in Information Retrieval},
  doi       = {10.1145/1390334.1390425},
  interhash = {bb9455c80cc9bd8cf95c951a1318dabc},
  intrahash = {0e6023e192f539fe4fce9894b1fbca5a},
  isbn      = {978-1-60558-164-4},
  location  = {Singapore, Singapore},
  pages     = {531--538},
  publisher = {ACM},
  title     = {Social tag prediction},
  url       = {http://portal.acm.org/citation.cfm?id=1390334.1390425},
  year      = 2008
}