@comment{Reviewed: reformatted one-entry-per-field; fixed mangled fields noted in
  per-entry @comment lines below. Citation keys kept unchanged (even awkward ones
  like "noauthororeditor" and the non-ASCII "istván2009latent") so existing \cite
  commands keep working. File targets classic BibTeX, so accented characters are
  written as brace-wrapped LaTeX escapes.}

@comment{noauthororeditor: the proceedings title had been mis-imported into the
  editor field ("of the NIPS 2010 Workshop on Deep Learning, Proceedings");
  moved to booktitle.}
@inproceedings{noauthororeditor,
  author    = {Mirowski, Piotr and Ranzato, Marc'Aurelio and LeCun, Yann},
  title     = {Dynamic Auto-Encoders for Semantic Indexing},
  booktitle = {Proceedings of the NIPS 2010 Workshop on Deep Learning},
  year      = {2010},
  url       = {http://yann.lecun.com/exdb/publis/pdf/mirowski-nipsdl-10.pdf},
  interhash = {b7ce347e904a4ca3263cf6cc1e2253bd},
  intrahash = {fc3e0e3af595f9a46df6bc9233df836f},
}

@comment{vorontsovtutorial: was @article with no journal (required-field warning);
  no venue information is available in the entry, so @misc is the honest type.}
@misc{vorontsovtutorial,
  author    = {Vorontsov, Konstantin and Potapenko, Anna},
  title     = {Tutorial on Probabilistic Topic Modeling: Additive Regularization for Stochastic Matrix Factorization},
  year      = {2014},
  interhash = {b3302a48be9b79342711884605ee3503},
  intrahash = {12f451e98ef51ea1060565ab96e19e3c},
}

@inproceedings{Ramage:2009:LLS:1699510.1699543,
  author    = {Ramage, Daniel and Hall, David and Nallapati, Ramesh and Manning, Christopher D.},
  title     = {Labeled {LDA}: A Supervised Topic Model for Credit Attribution in Multi-labeled Corpora},
  booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 1 - Volume 1},
  series    = {EMNLP '09},
  year      = {2009},
  pages     = {248--256},
  numpages  = {9},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA, USA},
  location  = {Singapore},
  isbn      = {978-1-932432-59-6},
  acmid     = {1699543},
  url       = {http://dl.acm.org/citation.cfm?id=1699510.1699543},
  abstract  = {A significant portion of the world's text is tagged by readers on social bookmarking websites. Credit attribution is an inherent problem in these corpora because most pages have multiple tags, but the tags do not always apply with equal specificity across the whole document. Solving the credit attribution problem requires associating each word in a document with the most appropriate tags and vice versa. This paper introduces Labeled LDA, a topic model that constrains Latent Dirichlet Allocation by defining a one-to-one correspondence between LDA's latent topics and user tags. This allows Labeled LDA to directly learn word-tag correspondences. We demonstrate Labeled LDA's improved expressiveness over traditional LDA with visualizations of a corpus of tagged web pages from del.icio.us. Labeled LDA outperforms SVMs by more than 3 to 1 when extracting tag-specific document snippets. As a multi-label text classifier, our model is competitive with a discriminative baseline on a variety of datasets.},
  interhash = {45315f4da7b10debdca560506cf0d7ba},
  intrahash = {6e7173f084e26bca9a8d2a1ab4a5b709},
}

@comment{kang2013lalda: arXiv identifier was buried in a run-together note
  ("cite arxiv:1301.6277Comment: ..."); moved to eprint/archiveprefix and the
  remaining comment kept as note (with the bare & escaped).}
@misc{kang2013lalda,
  author        = {Kang, Jeon-Hyung and Lerman, Kristina and Getoor, Lise},
  title         = {{LA-LDA}: A Limited Attention Topic Model for Social Recommendation},
  year          = {2013},
  eprint        = {1301.6277},
  archiveprefix = {arXiv},
  note          = {The 2013 International Conference on Social Computing, Behavioral-Cultural Modeling, \& Prediction (SBP 2013)},
  url           = {http://arxiv.org/abs/1301.6277},
  abstract      = {Social media users have finite attention which limits the number of incoming messages from friends they can process. Moreover, they pay more attention to opinions and recommendations of some friends more than others. In this paper, we propose LA-LDA, a latent topic model which incorporates limited, non-uniformly divided attention in the diffusion process by which opinions and information spread on the social network. We show that our proposed model is able to learn more accurate user models from users' social network and item adoption behavior than models which do not take limited attention into account. We analyze voting on news items on the social news aggregator Digg and show that our proposed model is better able to predict held out votes than alternative models. Our study demonstrates that psycho-socially motivated models have better ability to describe and predict observed behavior than models which only consider topics.},
  interhash     = {18a900ae003a2aedb3879fcaaa4e89b6},
  intrahash     = {84ae222ddb615ca8ae9421a29c07a8f6},
}

@misc{titov2008modeling,
  author        = {Titov, Ivan and McDonald, Ryan},
  title         = {Modeling Online Reviews with Multi-grain Topic Models},
  year          = {2008},
  eprint        = {0801.1063},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/0801.1063},
  abstract      = {In this paper we present a novel framework for extracting the ratable aspects of objects from online user reviews. Extracting such aspects is an important challenge in automatically mining product opinions from the web and in generating opinion-based summaries of user reviews. Our models are based on extensions to standard topic modeling methods such as LDA and PLSA to induce multi-grain topics. We argue that multi-grain models are more appropriate for our task since standard models tend to produce topics that correspond to global properties of objects (e.g., the brand of a product type) rather than the aspects of an object that tend to be rated by a user. The models we present not only extract ratable aspects, but also cluster them into coherent topics, e.g., `waitress' and `bartender' are part of the same topic `staff' for restaurants. This differentiates it from much of the previous work which extracts aspects through term frequency analysis with minimal clustering. We evaluate the multi-grain models both qualitatively and quantitatively to show that they improve significantly upon standard topic models.},
  interhash     = {00cbf1df09c3f2c65d5a31a0537aed3f},
  intrahash     = {f3286f5efa0115f465563d0259c32255},
}

@article{Zhang20125759,
  author    = {Zhang, Yin and Zhang, Bin and Gao, Kening and Guo, Pengwei and Sun, Daming},
  title     = {Combining content and relation analysis for recommendation in social tagging systems},
  journal   = {Physica A: Statistical Mechanics and its Applications},
  volume    = {391},
  number    = {22},
  pages     = {5759--5768},
  year      = {2012},
  issn      = {0378-4371},
  doi       = {10.1016/j.physa.2012.05.013},
  url       = {http://www.sciencedirect.com/science/article/pii/S0378437112003846},
  abstract  = {Social tagging is one of the most important ways to organize and index online resources. Recommendation in social tagging systems, e.g. tag recommendation, item recommendation and user recommendation, is used to improve the quality of tags and to ease the tagging or searching process. Existing works usually provide recommendations by analyzing relation information in social tagging systems, suffering a lot from the over sparse problem. These approaches ignore information contained in the content of resources, which we believe should be considered to improve recommendation quality and to deal with the over sparse problem. In this paper we propose a recommendation approach for social tagging systems that combines content and relation analysis in a single model. By modeling the generating process of social tagging systems in a latent Dirichlet allocation approach, we build a fully generative model for social tagging, leverage it to estimate the relation between users, tags and resources and achieve tag, item and user recommendation tasks. The model is evaluated using a CiteULike data snapshot, and results show improvements in metrics for various recommendation tasks.},
  interhash = {088ad59c786579d399aaee48db5e6a7a},
  intrahash = {84f824839090a5e20394b85a9e1cef08},
}

@article{Blei+Ng+Jordan:03a,
  author    = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},
  title     = {Latent {Dirichlet} Allocation},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {993--1022},
  year      = {2003},
  interhash = {9d1b808272b9e511425cbf557571e59a},
  intrahash = {cbccc8032ce2763326dbe5de19c58aaf},
}

@comment{wang2008lda: @techreport is missing its required institution field;
  not present in the original data -- TODO confirm and fill in.}
@techreport{wang2008lda,
  author    = {Wang, Yi},
  title     = {Distributed {Gibbs} Sampling of Latent Topic Models: The Gritty Details},
  year      = {2008},
  interhash = {4c363a5b79efeee4e46a15cf7f85deac},
  intrahash = {08b13d141b85fc0002bd6ec6cab9d18f},
}

@comment{heinrich2005parameter: was @article with journal = {Web: http://www.
  arbylon. net/...} (a space-garbled URL used as a journal name) plus a Google
  Scholar export link; converted to @misc with the repaired direct URL.}
@misc{heinrich2005parameter,
  author    = {Heinrich, G.},
  title     = {Parameter Estimation for Text Analysis},
  year      = {2005},
  url       = {http://www.arbylon.net/publications/text-est.pdf},
  interhash = {dbfae9d80facacc023f9a057930699ec},
  intrahash = {53342b5f95d19b270d200603d53ec18a},
}

@article{pu2009latent,
  author    = {Wang, Pu and Domeniconi, Carlotta and Laskey, Kathryn},
  title     = {Latent {Dirichlet} {Bayesian} Co-Clustering},
  journal   = {Machine Learning and Knowledge Discovery in Databases},
  pages     = {522--537},
  year      = {2009},
  url       = {http://dx.doi.org/10.1007/978-3-642-04174-7_34},
  abstract  = {Co-clustering has emerged as an important technique for mining contingency data matrices. However, almost all existing co-clustering algorithms are hard partitioning, assigning each row and column of the data matrix to one cluster. Recently a Bayesian co-clustering approach has been proposed which allows a probability distribution membership in row and column clusters. The approach uses variational inference for parameter estimation. In this work, we modify the Bayesian co-clustering model, and use collapsed Gibbs sampling and collapsed variational inference for parameter estimation. Our empirical evaluation on real data sets shows that both collapsed Gibbs sampling and collapsed variational inference are able to find more accurate likelihood estimates than the standard variational Bayesian co-clustering approach.},
  interhash = {ca3c6ea6255fd4fa4601502fd55bec24},
  intrahash = {0ef1833cdcdf2a7d9093e37894c4f3ab},
}

@comment{conf/icdm/ShanB08 and conf/icdm/AlSumaitBD08 both crossref
  conf/icdm/2008, which is NOT present in this file; classic BibTeX requires the
  parent entry to appear after its children -- add it or drop the crossrefs.}
@inproceedings{conf/icdm/ShanB08,
  author    = {Shan, Hanhuai and Banerjee, Arindam},
  title     = {{Bayesian} Co-clustering},
  booktitle = {ICDM},
  crossref  = {conf/icdm/2008},
  year      = {2008},
  pages     = {530--539},
  publisher = {IEEE Computer Society},
  date      = {2009-02-20},
  ee        = {http://dx.doi.org/10.1109/ICDM.2008.91},
  url       = {http://dblp.uni-trier.de/db/conf/icdm/icdm2008.html#ShanB08},
  interhash = {15f1d5cfb6898d44f170ae51a1f172ef},
  intrahash = {543b31ac1f6f8b70b94976abb95e73c7},
}

@comment{istván2009latent: key contains a non-ASCII character, which classic
  BibTeX toolchains may reject; kept unchanged so existing citations still
  resolve -- consider renaming (e.g. biro2009latent) in a coordinated change.}
@article{istván2009latent,
  author    = {B{\'\i}r{\'o}, Istv{\'a}n and Szab{\'o}, J{\'a}cint},
  title     = {Latent {Dirichlet} Allocation for Automatic Document Categorization},
  journal   = {Machine Learning and Knowledge Discovery in Databases},
  pages     = {430--441},
  year      = {2009},
  url       = {http://dx.doi.org/10.1007/978-3-642-04174-7_28},
  abstract  = {In this paper we introduce and evaluate a technique for applying latent Dirichlet allocation to supervised semantic categorization of documents. In our setup, for every category an own collection of topics is assigned, and for a labeled training document only topics from its category are sampled. Thus, compared to the classical LDA that processes the entire corpus in one, we essentially build separate LDA models for each category with the category-specific topics, and then these topic collections are put together to form a unified LDA model. For an unseen document the inferred topic distribution gives an estimation how much the document fits into the category.},
  interhash = {f4c5b12409be4108320cba5b8fd18c45},
  intrahash = {2db7477d992284eabea47e1c9669ab5a},
}

@article{loulwah2009topic,
  author    = {AlSumait, Loulwah and Barbar{\'a}, Daniel and Gentle, James and Domeniconi, Carlotta},
  title     = {Topic Significance Ranking of {LDA} Generative Models},
  journal   = {Machine Learning and Knowledge Discovery in Databases},
  pages     = {67--82},
  year      = {2009},
  url       = {http://dx.doi.org/10.1007/978-3-642-04180-8_22},
  abstract  = {Topic models, like Latent Dirichlet Allocation (LDA), have been recently used to automatically generate text corpora topics, and to subdivide the corpus words among those topics. However, not all the estimated topics are of equal importance or correspond to genuine themes of the domain. Some of the topics can be a collection of irrelevant words, or represent insignificant themes. Current approaches to topic modeling perform manual examination to find meaningful topics. This paper presents the first automated unsupervised analysis of LDA models to identify junk topics from legitimate ones, and to rank the topic significance. Basically, the distance between a topic distribution and three definitions of ``junk distribution'' is computed using a variety of measures, from which an expressive figure of the topic significance is implemented using 4-phase Weighted Combination approach. Our experiments on synthetic and benchmark datasets show the effectiveness of the proposed approach in ranking the topic significance.},
  interhash = {273b61715108282ac89350ba18f99eb2},
  intrahash = {6310cb442c4e7852070e4f631fa2c1fa},
}

@inproceedings{conf/icdm/AlSumaitBD08,
  author    = {AlSumait, Loulwah and Barbar{\'a}, Daniel and Domeniconi, Carlotta},
  title     = {On-line {LDA}: Adaptive Topic Models for Mining Text Streams with Applications to Topic Detection and Tracking},
  booktitle = {ICDM},
  crossref  = {conf/icdm/2008},
  year      = {2008},
  pages     = {3--12},
  publisher = {IEEE Computer Society},
  date      = {2009-02-20},
  ee        = {http://dx.doi.org/10.1109/ICDM.2008.140},
  url       = {http://dblp.uni-trier.de/db/conf/icdm/icdm2008.html#AlSumaitBD08},
  interhash = {e46dde3d53c823afeeb7604f1991b661},
  intrahash = {980e5cf0b3db547cf47c2c203734ac83},
}

@article{benyah2009identifying,
  author    = {Shaparenko, Benyah and Joachims, Thorsten},
  title     = {Identifying the Original Contribution of a Document via Language Modeling},
  journal   = {Machine Learning and Knowledge Discovery in Databases},
  pages     = {350--365},
  year      = {2009},
  url       = {http://dx.doi.org/10.1007/978-3-642-04174-7_23},
  abstract  = {One major goal of text mining is to provide automatic methods to help humans grasp the key ideas in ever-increasing text corpora. To this effect, we propose a statistically well-founded method for identifying the original ideas that a document contributes to a corpus, focusing on self-referential diachronic corpora such as research publications, blogs, email, and news articles. Our statistical model of passage impact defines (interesting) original content through a combination of impact and novelty, and the model is used to identify each document's most original passages. Unlike heuristic approaches, the statistical model is extensible and open to analysis. We evaluate the approach both on synthetic data and on real data in the domains of research publications and news, showing that the passage impact model outperforms a heuristic baseline method.},
  interhash = {7db7a26b3d568e8b405a8079b2c067e8},
  intrahash = {c50730e7a14f4a0c76305c44fa2eef2e},
}

@comment{pilz-named: dropped the spurious third author "Augustin, G. St" --
  "Sankt Augustin" is the city of the authors' institute, mis-parsed as a
  person by the exporter. Please confirm against the paper.}
@article{pilz-named,
  author    = {Pilz, A. and Paa{\ss}, G.},
  title     = {Named Entity Resolution Using Automatically Extracted Semantic Information},
  year      = {2009},
  url       = {http://scholar.google.de/scholar.bib?q=info:3tHCLbaX3_sJ:scholar.google.com/&output=citation&hl=de&ct=citation&cd=0},
  interhash = {056d7a3a9a77c31044e996acfb23cc8c},
  intrahash = {f956b02691b503629c6742b3e61489da},
}

@article{journals/jmlr/BanerjeeMDG05,
  author    = {Banerjee, Arindam and Merugu, Srujana and Dhillon, Inderjit S. and Ghosh, Joydeep},
  title     = {Clustering with {Bregman} Divergences},
  journal   = {Journal of Machine Learning Research},
  volume    = {6},
  pages     = {1705--1749},
  year      = {2005},
  date      = {2007-02-21},
  ee        = {http://www.jmlr.org/papers/v6/banerjee05b.html},
  url       = {http://dblp.uni-trier.de/db/journals/jmlr/jmlr6.html#BanerjeeMDG05},
  interhash = {50d46127d134382ca84699ce24171c3f},
  intrahash = {bba5d5241acf3ec9eea3f869a832c629},
}

@comment{nallapati2008link: was @article but carries a booktitle (conference
  paper) -- retyped as @inproceedings; author field was split mid-name by the
  line-wrapping.}
@inproceedings{nallapati2008link,
  author    = {Nallapati, R. and Cohen, W.},
  title     = {{Link-PLSA-LDA}: A New Unsupervised Model for Topics and Influence of Blogs},
  booktitle = {International Conference for Weblogs and Social Media},
  year      = {2008},
  url       = {http://scholar.google.de/scholar.bib?q=info:WFXUMUlnnKwJ:scholar.google.com/&output=citation&hl=de&ct=citation&cd=0},
  interhash = {fc5e49ebae9358381b43981b6794158b},
  intrahash = {a1f50ddb9b2734e224d4c4ea0140c7cd},
}

@comment{griffiths02: url field held a placeholder "/brokenurl#..."; restored
  from the bdsk-url-1 field.}
@techreport{griffiths02,
  author      = {Griffiths, Tom},
  title       = {Gibbs Sampling in the Generative Model of {Latent Dirichlet Allocation}},
  institution = {Stanford University},
  year        = {2002},
  url         = {http://www-psych.stanford.edu/~gruffydd/cogsci02/lda.ps},
  bdsk-url-1  = {www-psych.stanford.edu/~gruffydd/cogsci02/lda.ps},
  owner       = {heinrich},
  timestamp   = {2009.04.07},
  interhash   = {6eb20464c0bac4a7081aa6e3f318503a},
  intrahash   = {9b9261755a207a91b2f646e79cd5f83c},
}

@inproceedings{1529607,
  author    = {Henderson, Keith and Eliassi-Rad, Tina},
  title     = {Applying Latent {Dirichlet} Allocation to Group Discovery in Large Graphs},
  booktitle = {SAC '09: Proceedings of the 2009 ACM symposium on Applied Computing},
  year      = {2009},
  pages     = {1456--1461},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Honolulu, Hawaii},
  isbn      = {978-1-60558-166-8},
  doi       = {10.1145/1529282.1529607},
  url       = {http://portal.acm.org/citation.cfm?id=1529607},
  abstract  = {This paper introduces LDA-G, a scalable Bayesian approach to finding latent group structures in large real-world graph data. Existing Bayesian approaches for group discovery (such as Infinite Relational Models) have only been applied to small graphs with a couple of hundred nodes. LDA-G (short for Latent Dirichlet Allocation for Graphs) utilizes a well-known topic modeling algorithm to find latent group structure. Specifically, we modify Latent Dirichlet Allocation (LDA) to operate on graph data instead of text corpora. Our modifications reflect the differences between real-world graph data and text corpora (e.g., a node's neighbor count vs. a document's word count). In our empirical study, we apply LDA-G to several large graphs (with thousands of nodes) from PubMed (a scientific publication repository). We compare LDA-G's quantitative performance on link prediction with two existing approaches: one Bayesian (namely, Infinite Relational Model) and one non-Bayesian (namely, Cross-association). On average, LDA-G outperforms IRM by 15% and Cross-association by 25% (in terms of area under the ROC curve). Furthermore, we demonstrate that LDA-G can discover useful qualitative information.},
  interhash = {eab56c741afc642e465044326a68def2},
  intrahash = {7930085af6a4c3f11f8f2966bd16fd86},
}