% Bibliography database: publications co-authored by Padhraic Smyth.
% Layout: one field per line, fields in alphabetical order, braces as delimiters.
% The interhash / intrahash, acmid, citeulike-article-id, priority, owner and
% timestamp fields are exporter bookkeeping; standard styles ignore them.
% Citation keys are kept exactly as exported so existing citations keep working.

% arXiv preprint. The arXiv identifier lives in eprint / archiveprefix rather
% than in a free-text note, so eprint-aware styles can format and link it.
@misc{Rubin2011,
  abstract      = {Machine learning approaches to multi-label document classification have (to date) largely relied on discriminative modeling techniques such as support vector machines. A drawback of these approaches is that performance rapidly drops off as the total number of labels and the number of labels per document increase. This problem is amplified when the label frequencies exhibit the type of highly skewed distributions that are often observed in real-world datasets. In this paper we investigate a class of generative statistical topic models for multi-label documents that associate individual word tokens with different labels. We investigate the advantages of this approach relative to discriminative models, particularly with respect to classification problems involving large numbers of relatively rare labels. We compare the performance of generative and discriminative approaches on document labeling tasks ranging from datasets with several thousand labels to datasets with tens of labels. The experimental results indicate that generative models can achieve competitive multi-label classification performance compared to discriminative methods, and have advantages for datasets with many labels and skewed label frequencies.},
  archiveprefix = {arXiv},
  author        = {Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark},
  eprint        = {1107.2462},
  interhash     = {e09d5d8587756d460a5d834025e75aac},
  intrahash     = {f8a5a3958ae264d19c7f5415eb7f0bce},
  title         = {Statistical Topic Models for Multi-Label Document Classification},
  url           = {http://arxiv.org/abs/1107.2462},
  year          = 2011
}

% "KDD Cup" is braced so sentence-casing styles do not lowercase it.
@article{bennett2007workshop,
  abstract  = {The KDD Cup is the oldest of the many data mining competitions that are now popular. It is an integral part of the annual ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD). In 2007, the traditional KDD Cup competition was augmented with a workshop with a focus on the concurrently active Netflix Prize competition. The KDD Cup itself in 2007 consisted of a prediction competition using Netflix movie rating data, with tasks that were different and separate from those being used in the Netflix Prize itself. At the workshop, participants in both the KDD Cup and the Netflix Prize competition presented their results and analyses, and exchanged ideas.},
  address   = {New York, NY, USA},
  author    = {Bennett, James and Elkan, Charles and Liu, Bing and Smyth, Padhraic and Tikk, Domonkos},
  doi       = {10.1145/1345448.1345459},
  interhash = {268d74a2e593d3706b883af83e7ad5bd},
  intrahash = {83613ba2e5509adfe7497aaeee069149},
  issn      = {1931-0145},
  journal   = {SIGKDD Explorations Newsletter},
  month     = dec,
  number    = 2,
  pages     = {51--52},
  publisher = {ACM},
  title     = {{KDD Cup} and workshop 2007},
  url       = {http://doi.acm.org/10.1145/1345448.1345459},
  volume    = 9,
  year      = 2007
}

% NOTE(review): the next two entries are identical apart from their keys, and
% they describe the same chapter (same authors, book, pages and ISBN) as
% fayyad1996data and books/mit/fayyadPSU96/FayyadPS96 further down. All four
% keys are kept because any of them may be cited; consider consolidating once
% the citing documents have been checked.
% The chapter title is stored in title and the book title in booktitle, which
% is the field layout an incollection entry requires.
@incollection{fayyad1996advances,
  acmid     = {257942},
  address   = {Menlo Park, CA, USA},
  author    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  editor    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic and Uthurusamy, Ramasamy},
  interhash = {e62d85a492bbc917f43a5d9c8b775189},
  intrahash = {d0b54b224b992e51d892d0f06d45cf6b},
  isbn      = {0-262-56097-6},
  numpages  = {34},
  pages     = {1--34},
  publisher = {American Association for Artificial Intelligence},
  title     = {From Data Mining to Knowledge Discovery: An Overview},
  url       = {http://portal.acm.org/citation.cfm?id=257938.257942},
  year      = 1996
}

@incollection{Fayyad:1996:DMK:257938.257942,
  acmid     = {257942},
  address   = {Menlo Park, CA, USA},
  author    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  editor    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic and Uthurusamy, Ramasamy},
  interhash = {e62d85a492bbc917f43a5d9c8b775189},
  intrahash = {d0b54b224b992e51d892d0f06d45cf6b},
  isbn      = {0-262-56097-6},
  numpages  = {34},
  pages     = {1--34},
  publisher = {American Association for Artificial Intelligence},
  title     = {From Data Mining to Knowledge Discovery: An Overview},
  url       = {http://portal.acm.org/citation.cfm?id=257938.257942},
  year      = 1996
}

% Journal name spelled out in full, matching bennett2007workshop (same ISSN).
@article{1117458,
  abstract  = {Event-based network data consists of sets of events over time, each of which may involve multiple entities. Examples include email traffic, telephone calls, and research publications (interpreted as co-authorship events). Traditional network analysis techniques, such as social network models, often aggregate the relational information from each event into a single static network. In contrast, in this paper we focus on the temporal nature of such data. In particular, we look at the problems of temporal link prediction and node ranking, and describe new methods that illustrate opportunities for data mining and machine learning techniques in this context. Experimental results are discussed for a large set of co-authorship events measured over multiple years, and a large corporate email data set spanning 21 months.},
  address   = {New York, NY, USA},
  author    = {O'Madadhain, Joshua and Hutchins, Jon and Smyth, Padhraic},
  doi       = {10.1145/1117454.1117458},
  interhash = {97a718ab9fe24625f7389939d2608d31},
  intrahash = {89a23b31a476c4f3f771b5e3e4a8432c},
  issn      = {1931-0145},
  journal   = {SIGKDD Explorations Newsletter},
  number    = 2,
  pages     = {23--30},
  publisher = {ACM},
  title     = {Prediction and ranking algorithms for event-based network data},
  url       = {http://portal.acm.org/citation.cfm?id=1117458},
  volume    = 7,
  year      = 2005
}

% doi holds the bare identifier with no resolver prefix. address is the
% publisher city; the conference city is stored in venue so it cannot collide
% with address under biblatex, which treats address and location as one field.
@inproceedings{porteous_08,
  address   = {New York, NY, USA},
  author    = {Porteous, Ian and Newman, David and Ihler, Alexander and Asuncion, Arthur and Smyth, Padhraic and Welling, Max},
  booktitle = {KDD '08: Proceedings of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining},
  doi       = {10.1145/1401890.1401960},
  interhash = {d897cc05841b69f83e28f7e1aef4596b},
  intrahash = {1107a99a84336cd73fa29a899be81d70},
  isbn      = {978-1-60558-193-4},
  owner     = {gregor},
  pages     = {569--577},
  publisher = {ACM},
  timestamp = {2009.04.16},
  title     = {Fast collapsed {Gibbs} sampling for latent {Dirichlet} allocation},
  venue     = {Las Vegas, Nevada, USA},
  year      = 2008
}

% Same chapter as the two Fayyad entries above, exported from another source.
@incollection{fayyad1996data,
  abstract  = {Data mining and knowledge discovery in databases have been attracting a significant amount of research, industry, and media attention of late. What is all the excitement about? This article provides an overview of this emerging field, clarifying how data mining and knowledge discovery in databases are related both to each other and to related fields, such as machine learning, statistics, and databases. The article mentions particular real-world applications, specific data-mining techniques, challenges involved in real-world applications of knowledge discovery, and current and future research directions in the field.},
  address   = {Menlo Park, CA, USA},
  author    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  editor    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic and Uthurusamy, Ramasamy},
  interhash = {79663e4b1f464b82ce1ae45345dc424f},
  intrahash = {3f5a400d01a974f993cee1ac5f79cfc8},
  isbn      = {0-262-56097-6},
  pages     = {1--34},
  publisher = {American Association for Artificial Intelligence},
  title     = {From Data Mining to Knowledge Discovery: An Overview},
  url       = {http://portal.acm.org/citation.cfm?id=257942},
  year      = 1996
}

% DBLP record of the same chapter. The exported 2002-01-03 value contradicts
% the 1996 publication year, so it is kept under timestamp instead of date;
% biblatex would otherwise print 2002 as the publication date.
% NOTE(review): presumably that value is a record-modification date - confirm.
% address, editor and publisher are copied from fayyad1996data, which shares
% this entry's interhash.
@incollection{books/mit/fayyadPSU96/FayyadPS96,
  address   = {Menlo Park, CA, USA},
  author    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  editor    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic and Uthurusamy, Ramasamy},
  interhash = {79663e4b1f464b82ce1ae45345dc424f},
  intrahash = {e59886c68d1fc9bb4d1a8d6a1a644a60},
  pages     = {1--34},
  publisher = {American Association for Artificial Intelligence},
  timestamp = {2002-01-03},
  title     = {From Data Mining to Knowledge Discovery: An Overview},
  url       = {http://dblp.uni-trier.de/db/books/collections/fayyad96.html#FayyadPS96},
  year      = 1996
}

% Whole book: the exported record had no chapter or pages and its booktitle
% repeated the title, so it is typed as a book rather than a part of one.
@book{baldi03modelling,
  abstract             = {Modeling the Internet and the Web covers the most important aspects of modeling the Web using a modern mathematical and probabilistic treatment. It focuses on the information and application layers, as well as some of the emerging properties of the Internet.  Provides a comprehensive introduction to the modeling of the Internet and the Web at the information level.  Takes a modern approach based on mathematical, probabilistic, and graphical modeling.  Provides an integrated presentation of theory, examples, exercises and applications.  Covers key topics such as text analysis, link analysis, crawling techniques, human behaviour, and commerce on the Web. Interdisciplinary in nature, Modeling the Internet and the Web will be of interest to students and researchers from a variety of disciplines including computer science, machine learning, engineering, statistics, economics, business, and the social sciences.},
  author               = {Baldi, Pierre and Frasconi, Paolo and Smyth, Padhraic},
  citeulike-article-id = {822915},
  interhash            = {416f2405193ae7d30cffe673dee89df2},
  intrahash            = {3e4e2899e7d6988218d02a264bcfe24a},
  month                = apr,
  priority             = {2},
  publisher            = {Wiley},
  title                = {Modeling the {Internet} and the {Web}: Probabilistic Methods and Algorithms},
  url                  = {http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0470849061.html},
  year                 = 2003
}

@inproceedings{citeulike:391307,
  address              = {Arlington, VA, USA},
  author               = {Rosen-Zvi, Michal and Griffiths, Thomas and Steyvers, Mark and Smyth, Padhraic},
  booktitle            = {Proceedings of the 20th conference on Uncertainty in artificial intelligence},
  citeulike-article-id = {391307},
  interhash            = {79b4ff1335f13cdbe18a38086e9fab3b},
  intrahash            = {a4dd688efe5778fb99ff94de104211aa},
  isbn                 = {0974903906},
  pages                = {487--494},
  priority             = {0},
  publisher            = {AUAI Press},
  title                = {The author-topic model for authors and documents},
  url                  = {http://portal.acm.org/citation.cfm?id=1036843.1036902},
  year                 = 2004
}