% Topic-modeling bibliography.
% Conventions: one field per line, braced values, `--` in page ranges, LaTeX
% escapes instead of raw non-ASCII, whole-word brace protection in titles.
% The interhash/intrahash/ee/acmid/numpages fields are export metadata and are
% kept unchanged.
% NOTE(review): the crossref targets conf/ai/2008, conf/pkdd/2013-2,
% conf/kdd/2011 and conf/aistats/2009 are not defined in this part of the
% file; confirm the parent entries exist elsewhere (after their children).

% No journal is recorded for this item, so it is stored as a misc entry
% instead of an article. NOTE(review): confirm the publication venue.
@misc{vorontsovtutorial,
  author    = {Vorontsov, Konstantin and Potapenko, Anna},
  title     = {Tutorial on Probabilistic Topic Modeling: Additive Regularization for Stochastic Matrix Factorization},
  year      = {2014},
  interhash = {b3302a48be9b79342711884605ee3503},
  intrahash = {12f451e98ef51ea1060565ab96e19e3c},
}

@article{6542727,
  author    = {Luo, Wang and Li, Hongliang and Liu, Guanghui and Zeng, Liaoyuan},
  title     = {Semantic Annotation of Satellite Images Using Author-Genre-Topic Model},
  journal   = {IEEE Transactions on Geoscience and Remote Sensing},
  volume    = {52},
  number    = {2},
  pages     = {1356--1368},
  month     = feb,
  year      = {2014},
  doi       = {10.1109/TGRS.2013.2250978},
  issn      = {0196-2892},
  url       = {http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=6542727&abstractAccess=no&userType=inst},
  abstract  = {In this paper, we propose a novel hierarchical generative model, named author-genre-topic model (AGTM), to perform satellite image annotation. Different from the existing author-topic model in which each author and topic are associated with the multinomial distributions over topics and words, in AGTM, each genre, author, and topic are associated with the multinomial distributions over authors, topics, and words, respectively. The bias of the distribution of the authors with respect to the topics can be rectified by incorporating the distribution of the genres with respect to the authors. Therefore, the classification accuracy of documents is improved when the information of genre is introduced. By representing the images with several visual words, the AGTM can be used for satellite image annotation. The labels of classes and scenes of the images correspond to the authors and the genres of the documents, respectively. The labels of classes and scenes of test images can be estimated, and the accuracy of satellite image annotation is improved when the information of scenes is introduced in the training images. Experimental results demonstrate the good performance of the proposed method.},
  interhash = {4152c5c479a7eae90a4ee1f63dc89610},
  intrahash = {a68906eb86024782ace5fe7a33d16522},
}

% Conference paper: the venue was stored in a non-standard `conference` field
% on an article entry; it now lives in `booktitle`.
@inproceedings{kataria2011context,
  author    = {Kataria, Saurabh and Mitra, Prasenjit and Caragea, Cornelia and Giles, C.},
  title     = {Context Sensitive Topic Models for Author Influence in Document Networks},
  booktitle = {International Joint Conference on Artificial Intelligence},
  year      = {2011},
  url       = {https://www.aaai.org/ocs/index.php/IJCAI/IJCAI11/paper/view/3140},
  abstract  = {In a document network such as a citation network of scientific documents, web-logs etc., the content produced by authors exhibit their interest in certain topics. In addition some authors influence other authors' interests. In this work, we propose to model the influence of cited authors along with the interests of citing authors. Moreover, we hypothesize that citations present in documents, the context surrounding the citation mention provides extra topical information about the cited authors. However, associating terms in the context to the cited authors remains an open problem. We propose novel document generation schemes that incorporate the context while simultaneously modeling the interests of citing authors and influence of the cited authors. Our experiments show significant improvements over baseline models for various evaluation criteria such as link prediction between document and cited author, and quantitatively explaining unseen text.},
  interhash = {7496b4df1335fbc6aea691cecb65289d},
  intrahash = {dc774d17ec721be6d32530d265f34539},
}

@inproceedings{conf/ai/ShafieiM08,
  author    = {Shafiei, M. Mahdi and Milios, Evangelos E.},
  title     = {A Statistical Model for Topic Segmentation and Clustering},
  booktitle = {Canadian Conference on AI},
  editor    = {Bergler, Sabine},
  series    = {Lecture Notes in Computer Science},
  volume    = {5032},
  pages     = {283--295},
  publisher = {Springer},
  year      = {2008},
  doi       = {10.1007/978-3-540-68825-9_27},
  isbn      = {978-3-540-68821-1},
  url       = {http://dblp.uni-trier.de/db/conf/ai/ai2008.html#ShafieiM08},
  crossref  = {conf/ai/2008},
  ee        = {http://dx.doi.org/10.1007/978-3-540-68825-9_27},
  interhash = {1ed1fddf0ac4762ea8debac2ee80b936},
  intrahash = {80e27cd4ea288b0ab6bcc1c67841364e},
}

@article{journals/ml/DuBJ10,
  author    = {Du, Lan and Buntine, Wray L. and Jin, Huidong},
  title     = {A segmented topic model based on the two-parameter {Poisson-Dirichlet} process},
  journal   = {Machine Learning},
  volume    = {81},
  number    = {1},
  pages     = {5--19},
  year      = {2010},
  doi       = {10.1007/s10994-010-5197-4},
  url       = {http://dblp.uni-trier.de/db/journals/ml/ml81.html#DuBJ10},
  ee        = {http://dx.doi.org/10.1007/s10994-010-5197-4},
  interhash = {f39304f04fa411cc2c9232aa7eb83b83},
  intrahash = {286291dfe97008c5bda330ffc0b72af1},
}

@inproceedings{Ramage:2009:LLS:1699510.1699543,
  author    = {Ramage, Daniel and Hall, David and Nallapati, Ramesh and Manning, Christopher D.},
  title     = {Labeled {LDA}: A Supervised Topic Model for Credit Attribution in Multi-labeled Corpora},
  booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 1},
  series    = {EMNLP '09},
  pages     = {248--256},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA, USA},
  location  = {Singapore},
  year      = {2009},
  isbn      = {978-1-932432-59-6},
  url       = {http://dl.acm.org/citation.cfm?id=1699510.1699543},
  acmid     = {1699543},
  numpages  = {9},
  abstract  = {A significant portion of the world's text is tagged by readers on social bookmarking websites. Credit attribution is an inherent problem in these corpora because most pages have multiple tags, but the tags do not always apply with equal specificity across the whole document. Solving the credit attribution problem requires associating each word in a document with the most appropriate tags and vice versa. This paper introduces Labeled LDA, a topic model that constrains Latent Dirichlet Allocation by defining a one-to-one correspondence between LDA's latent topics and user tags. This allows Labeled LDA to directly learn word-tag correspondences. We demonstrate Labeled LDA's improved expressiveness over traditional LDA with visualizations of a corpus of tagged web pages from del.icio.us. Labeled LDA outperforms SVMs by more than 3 to 1 when extracting tag-specific document snippets. As a multi-label text classifier, our model is competitive with a discriminative baseline on a variety of datasets.},
  interhash = {45315f4da7b10debdca560506cf0d7ba},
  intrahash = {6e7173f084e26bca9a8d2a1ab4a5b709},
}

@inproceedings{conf/pkdd/BalasubramanyanDC13,
  author    = {Balasubramanyan, Ramnath and Dalvi, Bhavana Bharat and Cohen, William W.},
  title     = {From Topic Models to Semi-supervised Learning: Biasing Mixed-Membership Models to Exploit Topic-Indicative Features in Entity Clustering},
  booktitle = {ECML/PKDD (2)},
  editor    = {Blockeel, Hendrik and Kersting, Kristian and Nijssen, Siegfried and {\v{Z}}elezn{\'y}, Filip},
  series    = {Lecture Notes in Computer Science},
  volume    = {8189},
  pages     = {628--642},
  publisher = {Springer},
  year      = {2013},
  doi       = {10.1007/978-3-642-40991-2_40},
  isbn      = {978-3-642-40990-5},
  url       = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2013-2.html#BalasubramanyanDC13},
  crossref  = {conf/pkdd/2013-2},
  ee        = {http://dx.doi.org/10.1007/978-3-642-40991-2_40},
  interhash = {9a32b7cc059a500ea302d0aa65036682},
  intrahash = {e56623d21a1b7bcb442cd15fe098bb70},
}

% The export had DBLP homonym numbers ("0002", "0001") in the surname slot of
% the last two authors; they are removed so the names parse correctly.
@inproceedings{conf/kdd/HongYGD11,
  author    = {Hong, Liangjie and Yin, Dawei and Guo, Jian and Davison, Brian D.},
  title     = {Tracking trends: incorporating term volume into temporal topic models},
  booktitle = {KDD},
  editor    = {Apt{\'e}, Chid and Ghosh, Joydeep and Smyth, Padhraic},
  pages     = {484--492},
  publisher = {ACM},
  year      = {2011},
  doi       = {10.1145/2020408.2020485},
  isbn      = {978-1-4503-0813-7},
  url       = {http://dblp.uni-trier.de/db/conf/kdd/kdd2011.html#HongYGD11},
  crossref  = {conf/kdd/2011},
  ee        = {http://doi.acm.org/10.1145/2020408.2020485},
  interhash = {35519287a72896f1adee0aaf14430dd8},
  intrahash = {a636ba59e9c57611c070e30086b27592},
}

@incollection{bleigjt03,
  author    = {Blei, D. M. and Griffiths, T. L. and Jordan, M. I. and Tenenbaum, J. B.},
  title     = {Hierarchical topic models and the nested {Chinese} restaurant process},
  booktitle = {Advances in {Neural} {Information} {Processing} {Systems} 16},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2004},
  interhash = {f185b4657e25c733ee613bece516b3c5},
  intrahash = {3e438204424fa2c6e8915bd8f0baf112},
}

@misc{kang2013lalda,
  author        = {Kang, Jeon-Hyung and Lerman, Kristina and Getoor, Lise},
  title         = {{LA-LDA}: A Limited Attention Topic Model for Social Recommendation},
  year          = {2013},
  eprint        = {1301.6277},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1301.6277},
  note          = {The 2013 International Conference on Social Computing, Behavioral-Cultural Modeling, \& Prediction (SBP 2013)},
  abstract      = {Social media users have finite attention which limits the number of incoming messages from friends they can process. Moreover, they pay more attention to opinions and recommendations of some friends more than others. In this paper, we propose LA-LDA, a latent topic model which incorporates limited, non-uniformly divided attention in the diffusion process by which opinions and information spread on the social network. We show that our proposed model is able to learn more accurate user models from users' social network and item adoption behavior than models which do not take limited attention into account. We analyze voting on news items on the social news aggregator Digg and show that our proposed model is better able to predict held out votes than alternative models. Our study demonstrates that psycho-socially motivated models have better ability to describe and predict observed behavior than models which only consider topics.},
  interhash     = {18a900ae003a2aedb3879fcaaa4e89b6},
  intrahash     = {84ae222ddb615ca8ae9421a29c07a8f6},
}

@misc{goldenberg2009survey,
  author        = {Goldenberg, Anna and Zheng, Alice X. and Fienberg, Stephen E. and Airoldi, Edoardo M.},
  title         = {A survey of statistical network models},
  year          = {2009},
  eprint        = {0912.5410},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/0912.5410},
  note          = {96 pages, 14 figures, 333 references},
  abstract      = {Networks are ubiquitous in science and have become a focal point for discussion in everyday life. Formal statistical models for the analysis of network data have emerged as a major topic of interest in diverse areas of study, and most of these involve a form of graphical representation. Probability models on graphs date back to 1959. Along with empirical studies in social psychology and sociology from the 1960s, these early works generated an active network community and a substantial literature in the 1970s. This effort moved into the statistical literature in the late 1970s and 1980s, and the past decade has seen a burgeoning network literature in statistical physics and computer science. The growth of the World Wide Web and the emergence of online networking communities such as Facebook, MySpace, and LinkedIn, and a host of more specialized professional network communities has intensified interest in the study of networks and network data. Our goal in this review is to provide the reader with an entry point to this burgeoning literature. We begin with an overview of the historical development of statistical network modeling and then we introduce a number of examples that have been studied in the network literature. Our subsequent discussion focuses on a number of prominent static and dynamic network models and their interconnections. We emphasize formal model descriptions, and pay special attention to the interpretation of parameters and their estimation. We end with a description of some open problems and challenges for machine learning and statistics.},
  interhash     = {bab22de06306d84cf357aadf48982d87},
  intrahash     = {5e341981218d7cd89416c3371d56c794},
}

@inproceedings{journals/jmlr/ChangB09,
  author    = {Chang, Jonathan and Blei, David M.},
  title     = {Relational Topic Models for Document Networks},
  booktitle = {AISTATS},
  editor    = {van Dyk, David A. and Welling, Max},
  series    = {JMLR Proceedings},
  volume    = {5},
  pages     = {81--88},
  publisher = {JMLR.org},
  year      = {2009},
  url       = {http://dblp.uni-trier.de/db/journals/jmlr/jmlrp5.html#ChangB09},
  crossref  = {conf/aistats/2009},
  ee        = {http://www.jmlr.org/proceedings/papers/v5/chang09a.html},
  interhash = {f3431fd69b315a22422a2c0f15ee0b71},
  intrahash = {86f665b74ecabb56e81542e0f052a331},
}

@misc{lan2013joint,
  author        = {Lan, Andrew S. and Studer, Christoph and Waters, Andrew E. and Baraniuk, Richard G.},
  title         = {Joint Topic Modeling and Factor Analysis of Textual Information and Graded Response Data},
  year          = {2013},
  eprint        = {1305.1956},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1305.1956},
  abstract      = {Modern machine learning methods are critical to the development of large-scale personalized learning systems that cater directly to the needs of individual learners. The recently developed SPARse Factor Analysis (SPARFA) framework provides a new statistical model and algorithms for machine learning-based learning analytics, which estimate a learner's knowledge of the latent concepts underlying a domain, and content analytics, which estimate the relationships among a collection of questions and the latent concepts. SPARFA estimates these quantities given only the binary-valued graded responses to a collection of questions. In order to better interpret the estimated latent concepts, SPARFA relies on a post-processing step that utilizes user-defined tags (e.g., topics or keywords) available for each question. In this paper, we relax the need for user-defined tags by extending SPARFA to jointly process both graded learner responses and the text of each question and its associated answer(s) or other feedback. Our purely data-driven approach (i) enhances the interpretability of the estimated latent concepts without the need of explicitly generating a set of tags or performing a post-processing step, (ii) improves the prediction performance of SPARFA, and (iii) scales to large test/assessments where human annotation would prove burdensome. We demonstrate the efficacy of the proposed approach on two real educational datasets.},
  interhash     = {911707523671c994e5c3fe63c3df5c4a},
  intrahash     = {2a8df43258181ed85e5d43b489fd45fb},
}

% NOTE(review): the `journal` field is not used by techreport styles; it is
% kept from the export unchanged. Confirm whether it should move to `note`.
@techreport{ilprints750,
  author      = {Haveliwala, Taher H.},
  title       = {Topic-Sensitive {PageRank}: A Context-Sensitive Ranking Algorithm for Web Search},
  institution = {Stanford InfoLab},
  type        = {Technical Report},
  number      = {2003-29},
  publisher   = {Stanford InfoLab},
  journal     = {IEEE Transactions on Knowledge and Data Engineering},
  year        = {2003},
  url         = {http://ilpubs.stanford.edu:8090/750/},
  note        = {Extended version of the WWW2002 paper on Topic-Sensitive PageRank.},
  abstract    = {The original PageRank algorithm for improving the ranking of search-query results computes a single vector, using the link structure of the Web, to capture the relative ``importance'' of Web pages, independent of any particular search query. To yield more accurate search results, we propose computing a {\em set} of PageRank vectors, biased using a set of representative topics, to capture more accurately the notion of importance with respect to a particular topic. For ordinary keyword search queries, we compute the topic-sensitive PageRank scores for pages satisfying the query using the topic of the query keywords. For searches done in context (e.g., when the search query is performed by highlighting words in a Web page), we compute the topic-sensitive PageRank scores using the topic of the context in which the query appeared. By using linear combinations of these (precomputed) biased PageRank vectors to generate context-specific importance scores for pages at query time, we show that we can generate more accurate rankings than with a single, generic PageRank vector.},
  interhash   = {959ab9af6c35acb5d8513fa032620ba7},
  intrahash   = {34aedd24fc7a45f189be1ca70dfd99e2},
}

@misc{Rubin2011,
  author        = {Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark},
  title         = {Statistical Topic Models for Multi-Label Document Classification},
  year          = {2011},
  eprint        = {1107.2462},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1107.2462},
  abstract      = {Machine learning approaches to multi-label document classification have (to date) largely relied on discriminative modeling techniques such as support vector machines. A drawback of these approaches is that performance rapidly drops off as the total number of labels and the number of labels per document increase. This problem is amplified when the label frequencies exhibit the type of highly skewed distributions that are often observed in real-world datasets. In this paper we investigate a class of generative statistical topic models for multi-label documents that associate individual word tokens with different labels. We investigate the advantages of this approach relative to discriminative models, particularly with respect to classification problems involving large numbers of relatively rare labels. We compare the performance of generative and discriminative approaches on document labeling tasks ranging from datasets with several thousand labels to datasets with tens of labels. The experimental results indicate that generative models can achieve competitive multi-label classification performance compared to discriminative methods, and have advantages for datasets with many labels and skewed label frequencies.},
  interhash     = {e09d5d8587756d460a5d834025e75aac},
  intrahash     = {f8a5a3958ae264d19c7f5415eb7f0bce},
}

@inproceedings{1661779,
  author    = {Tang, Jie and Leung, Ho-fung and Luo, Qiong and Chen, Dewei and Gong, Jibin},
  title     = {Towards ontology learning from folksonomies},
  booktitle = {IJCAI'09: Proceedings of the 21st International Joint Conference on Artificial Intelligence},
  pages     = {2089--2094},
  publisher = {Morgan Kaufmann Publishers Inc.},
  address   = {San Francisco, CA, USA},
  location  = {Pasadena, California, USA},
  year      = {2009},
  url       = {http://ijcai.org/papers09/Papers/IJCAI09-344.pdf},
  abstract  = {A folksonomy refers to a collection of user-defined tags with which users describe contents published on the Web. With the flourish of Web 2.0, folksonomies have become an important mean to develop the Semantic Web. Because tags in folksonomies are authored freely, there is a need to understand the structure and semantics of these tags in various applications. In this paper, we propose a learning approach to create an ontology that captures the hierarchical semantic structure of folksonomies. Our experimental results on two different genres of real world data sets show that our method can effectively learn the ontology structure from the folksonomies.},
  interhash = {17f95a6ba585888cf45443926d8b7e98},
  intrahash = {7b335f08a288a79eb70eff89f1ec7630},
}

% `pages` holds the article number; it is braced so the leading zero is kept
% as text.
@article{carpena:035102,
  author    = {Carpena, P. and Bernaola-Galv{\'a}n, P. and Hackenberg, M. and Coronado, A. V. and Oliver, J. L.},
  title     = {Level statistics of words: Finding keywords in literary texts and symbolic sequences},
  journal   = {Physical Review E (Statistical, Nonlinear, and Soft Matter Physics)},
  volume    = {79},
  number    = {3},
  pages     = {035102},
  eid       = {035102},
  numpages  = {4},
  publisher = {APS},
  year      = {2009},
  doi       = {10.1103/PhysRevE.79.035102},
  url       = {http://bioinfo2.ugr.es/TextKeywords/},
  interhash = {3444159872c65ea89d007d1838686acc},
  intrahash = {34dcb1eee3ffa31ff4eb77087343c146},
}

@inproceedings{IfrimTW-ICML2005,
  author    = {Ifrim, Georgiana and Theobald, Martin and Weikum, Gerhard},
  title     = {Learning Word-to-Concept Mappings for Automatic Text Classification},
  booktitle = {Proceedings of the 22nd International Conference on Machine Learning - Learning in Web Search (LWS 2005)},
  editor    = {De Raedt, Luc and Wrobel, Stefan},
  pages     = {18--26},
  address   = {Bonn, Germany},
  year      = {2005},
  isbn      = {1-59593-180-5},
  url       = {http://www.mpi-inf.mpg.de/~ifrim/publications/icml-lws05.pdf},
  interhash = {a54c4070e0fb55f5a084a0f088230a65},
  intrahash = {57f8241941ed979455c3dbb90893020f},
}

@inproceedings{haveliwala02topicsensitive,
  author    = {Haveliwala, Taher H.},
  title     = {Topic-sensitive {PageRank}},
  booktitle = {Proceedings of the Eleventh International World Wide Web Conference},
  address   = {Honolulu, Hawaii},
  month     = may,
  year      = {2002},
  url       = {http://citeseer.csail.mit.edu/haveliwala02topicsensitive.html},
  interhash = {29a20afd5026732686509987f603d33d},
  intrahash = {c056611effc0d18aae71a6d535ff6c5a},
}

% The export's URL had no scheme; http:// is added so it resolves as a link.
@article{Chakrabartietal99,
  author    = {Chakrabarti, S. and van den Berg, M. and Dom, B.},
  title     = {Focused Crawling: A New Approach to Topic-Specific Web Resource Discovery},
  journal   = {Computer Networks},
  volume    = {31},
  pages     = {1623--1640},
  year      = {1999},
  isbn      = {90-74821-43-X},
  url       = {http://citeseer.nj.nec.com/chakrabarti99focused.html},
  interhash = {e35ac8e9c02ab2a5075b9c1692ac7a2d},
  intrahash = {004dd97a2b2e71fa2cfe6820c74c9701},
}