% Bibliography: text-, web- and graph-mining references.
%
% Conventions used in this file:
%   * one field per line, values brace-delimited;
%   * page ranges use a double hyphen ("--");
%   * "doi" holds the bare DOI (no resolver prefix);
%   * "month" uses the predefined three-letter macros (jan ... dec);
%   * interhash / intrahash / ee / refid / asin / typesource / jlprojects /
%     timestamp / isbn10 are bookkeeping fields that standard styles ignore
%     (the hash fields look like a BibSonomy export -- TODO confirm).
% Citation keys are unchanged so existing citations keep resolving.


% Workshop paper. The crossref parent (conf/icdm/2010w) is not defined in the
% visible part of this file -- NOTE(review): confirm it exists elsewhere.
@inproceedings{conf/icdm/YassineH10,
  author    = {Yassine, Mohamed and Hajj, Hazem},
  title     = {A Framework for Emotion Mining from Text in Online Social Networks},
  booktitle = {ICDM Workshops},
  editor    = {Fan, Wei and Hsu, Wynne and Webb, Geoffrey I. and Liu, Bing and Zhang, Chengqi and Gunopulos, Dimitrios and Wu, Xindong},
  publisher = {IEEE Computer Society},
  pages     = {1136--1142},
  year      = {2010},
  doi       = {10.1109/ICDMW.2010.75},
  url       = {http://dblp.uni-trier.de/db/conf/icdm/icdmw2010.html#YassineH10},
  crossref  = {conf/icdm/2010w},
  ee        = {http://dx.doi.org/10.1109/ICDMW.2010.75},
  interhash = {72ae8c258d6559e4a90370453ecc2acc},
  intrahash = {8b0afeee143cec94f3058c214ae38c6f},
}

% arXiv preprint. The identifier lives in eprint/archiveprefix rather than in
% a free-text note, so eprint-aware styles can format and link it.
@misc{Rubin2011,
  author        = {Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark},
  title         = {Statistical Topic Models for Multi-Label Document Classification},
  year          = {2011},
  eprint        = {1107.2462},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1107.2462},
  abstract      = {Machine learning approaches to multi-label document classification have (to date) largely relied on discriminative modeling techniques such as support vector machines. A drawback of these approaches is that performance rapidly drops off as the total number of labels and the number of labels per document increase. This problem is amplified when the label frequencies exhibit the type of highly skewed distributions that are often observed in real-world datasets. In this paper we investigate a class of generative statistical topic models for multi-label documents that associate individual word tokens with different labels. We investigate the advantages of this approach relative to discriminative models, particularly with respect to classification problems involving large numbers of relatively rare labels. We compare the performance of generative and discriminative approaches on document labeling tasks ranging from datasets with several thousand labels to datasets with tens of labels. The experimental results indicate that generative models can achieve competitive multi-label classification performance compared to discriminative methods, and have advantages for datasets with many labels and skewed label frequencies.},
  interhash     = {e09d5d8587756d460a5d834025e75aac},
  intrahash     = {f8a5a3958ae264d19c7f5415eb7f0bce},
}

% Book. NOTE(review): the two people listed are believed to be the volume's
% editors; consider moving them to the editor field after checking the title
% page. The original isbn field held two numbers; the ISBN-13 is kept in isbn
% and the ISBN-10 is preserved in the ignored field isbn10.
@book{srivastava2009mining,
  author    = {Srivastava, Ashok and Sahami, Mehran},
  title     = {Text Mining: Classification, Clustering, and Applications},
  publisher = {CRC Press},
  address   = {Boca Raton, FL},
  year      = {2009},
  isbn      = {9781420059403},
  isbn10    = {1420059408},
  url       = {http://www.worldcat.org/search?qt=worldcat_org_all&q=9781420059403},
  abstract  = {Giving a broad perspective of the field from numerous vantage points, 'Text Mining' focuses on statistical methods for text mining and analysis. It examines methods to automatically cluster and classify text documents and applies these methods in a variety of areas.},
  refid     = {144226505},
  interhash = {290eabe518274b6fbcc73a106a7d52a6},
  intrahash = {45ab79501c114299142864becfa6c841},
}

% Journal article; the entry carries no page numbers.
@article{march06crane,
  author    = {Crane, Gregory},
  title     = {What Do You Do with a Million Books?},
  journal   = {D-Lib Magazine},
  volume    = {12},
  number    = {3},
  month     = mar,
  year      = {2006},
  issn      = {1082-9873},
  doi       = {10.1045/march2006-crane},
  url       = {http://www.dlib.org/dlib/march06/crane/03crane.html},
  interhash = {36d4825e3189d89195693d1449e9aaea},
  intrahash = {eea7ae2ac1480c84f87544f2942c28f2},
}

% Same work as the entry keyed Pang2008 further down (identical interhash).
% Both keys are kept, with harmonised metadata, so that documents citing
% either key keep compiling.
@article{Pang.Lee2008OpinionMiningand,
  author     = {Pang, Bo and Lee, Lillian},
  title      = {Opinion Mining and Sentiment Analysis},
  journal    = {Foundations and Trends in Information Retrieval},
  volume     = {2},
  number     = {1-2},
  pages      = {1--135},
  year       = {2008},
  doi        = {10.1561/1500000001},
  abstract   = {An important part of our information-gathering behavior has always been to find out what other people think. With the growing availability and popularity of opinion-rich resources such as online review sites and personal blogs, new opportunities and challenges arise as people now can, and do, actively use information technologies to seek out and understand the opinions of others. The sudden eruption of activity in the area of opinion mining and sentiment analysis, which deals with the computational treatment of opinion, sentiment, and subjectivity in text, has thus occurred at least in part as a direct response to the surge of interest in new systems that deal directly with opinions as a first-class object. This survey covers techniques and approaches that promise to directly enable opinion-oriented information-seeking systems. Our focus is on methods that seek to address the new challenges raised by sentiment-aware applications, as compared to those that are already present in more traditional fact-based analysis. We include material on summarization of evaluative text and on broader issues regarding privacy, manipulation, and economic impact that the development of opinion-oriented information-access services gives rise to. To facilitate future work, a discussion of available resources, benchmark datasets, and evaluation campaigns is also provided.},
  jlprojects = {cyberemotions},
  interhash  = {7bfd8b20ea5f9fb76e96d71c3155c50c},
  intrahash  = {60ec6588322693f7636f8cd3bc820783},
}

% Journal article; publisher/address are kept for styles that print them.
@article{1324190,
  author    = {Stavrianou, Anna and Andritsos, Periklis and Nicoloyannis, Nicolas},
  title     = {Overview and Semantic Issues of Text Mining},
  journal   = {SIGMOD Record},
  volume    = {36},
  number    = {3},
  pages     = {23--34},
  year      = {2007},
  publisher = {ACM},
  address   = {New York, NY, USA},
  issn      = {0163-5808},
  doi       = {10.1145/1324185.1324190},
  url       = {http://portal.acm.org/citation.cfm?id=1324190},
  interhash = {bde58d2eeb65f2194171f93b0e1f2a21},
  intrahash = {d8c54095392c0e83ab4f50f694d3b1f3},
}

% Same work as Pang.Lee2008OpinionMiningand above (identical interhash); the
% DOI is taken from that entry. The free-text date "July 2008" is expressed
% as month + year, and the publisher moves out of the unknown field "tech".
@article{Pang2008,
  author    = {Pang, Bo and Lee, Lillian},
  title     = {Opinion Mining and Sentiment Analysis},
  journal   = {Foundations and Trends in Information Retrieval},
  volume    = {2},
  number    = {1-2},
  pages     = {1--135},
  month     = jul,
  year      = {2008},
  publisher = {Now Publishers},
  isbn      = {978-1-60198-150-9},
  doi       = {10.1561/1500000001},
  url       = {http://www.cs.cornell.edu/home/llee/omsa/omsa-published.pdf},
  interhash = {7bfd8b20ea5f9fb76e96d71c3155c50c},
  intrahash = {236d4f703fda3dd9457863f28eda56cb},
}

% Conference paper. The acronym in the title is braced so that sentence-casing
% styles do not lower-case it.
@inproceedings{Chakrabarti:2004,
  author    = {Chakrabarti, Deepayan and Zhan, Yiping and Faloutsos, Christos},
  title     = {{R-MAT}: A Recursive Model for Graph Mining},
  booktitle = {SIAM International Conference on Data Mining},
  year      = {2004},
  url       = {http://www.cs.cmu.edu/~christos/PUBLICATIONS/siam04.pdf},
  interhash = {d7719c6e919fbb8a37e09464f12988b6},
  intrahash = {5e5cc221d7da719909f3bf8c507b0afc},
}

% Recorded while the article was still in press: there is no volume or page
% range yet, so the publication status is carried in note instead of volume.
% NOTE(review): replace with final volume/pages once verified.
@article{kostoff,
  author    = {Kostoff, Ronald N.},
  title     = {Literature-related discovery (LRD): Potential treatments for cataracts},
  journal   = {Technological Forecasting and Social Change},
  year      = {2007},
  note      = {In press, corrected proof},
  url       = {http://www.sciencedirect.com/science/article/B6V71-4RDB8SC-9/2/8991fe8968a0ef12f22ed7e9ac9d7c4f},
  abstract  = {Literature-related discovery (LRD) is the linking of two or more literature concepts that have heretofore not been linked (i.e., disjoint), in order to produce novel, interesting, plausible, and intelligible knowledge (i.e., potential discovery). The open discovery systems (ODS) component of LRD starts with a problem to be solved, and generates solutions to that problem through potential discovery. We have been using ODS LRD to identify potential treatments or preventative actions for challenging medical problems, among myriad other applications. This paper describes the second medical problem we addressed (cataract) using ODS LRD; the first problem addressed was Raynaud's Phenomenon (RP), and was described in the third paper of this Special Issue. Cataract was selected because it is ubiquitous globally, appears intractable to all forms of treatment other than surgical removal of cataracts, and is a major cause of blindness in many developing countries. The ODS LRD study had three objectives: a) identify non-drug non-surgical treatments that would 1) help prevent cataracts, or 2) reduce the progression rate of cataracts, or 3) stop the progression of cataracts, or 4) maybe even reverse the progression of cataracts; b) demonstrate that we could solve an ODS LRD problem with no prior knowledge of any results or prior work (unlike the case with the RP problem); c) determine whether large time savings in the discovery process were possible relative to the time required for conducting the RP study. To that end, we used the MeSH taxonomy of MEDLINE to restrict potential discoveries to selected semantic classes, as a substitute for the manually-intensive process used in the RP study to restrict potential discoveries to selected semantic classes. We also used additional semantic filtering to identify potential discovery within the selected semantic classes. All these goals were achieved. As will be shown, we generated large amounts of potential discovery in more than an order of magnitude less time than required for the RP study. We identified many non-drug non-surgical treatments that may be able to reduce or even stop the progression rate of cataracts. Time, and much testing, will determine whether this is possible. Finally, the methodology has been developed to the point where ODS LRD problems can be solved with no results or knowledge of any prior work.},
  interhash = {45ce0cd73dd62182ce1e447ba9fe71eb},
  intrahash = {b9359f79985da9b9677340ffda849e74},
}

% Conference paper in an LNCS volume. The original "date = 2005-01-05"
% disagreed with "year = 2004"; it is presumably a record-modification stamp
% (TODO confirm), so it is kept in the ignored field timestamp and year alone
% gives the publication date. The crossref parent (conf/das/2004) is not
% defined in the visible part of this file -- NOTE(review): confirm it exists.
@inproceedings{conf/das/SchenkerBLK04,
  author    = {Schenker, Adam and Bunke, Horst and Last, Mark and Kandel, Abraham},
  title     = {A Graph-Based Framework for Web Document Mining},
  booktitle = {Document Analysis Systems},
  editor    = {Marinai, Simone and Dengel, Andreas},
  series    = {Lecture Notes in Computer Science},
  volume    = {3163},
  publisher = {Springer},
  pages     = {401--412},
  year      = {2004},
  isbn      = {3-540-23060-2},
  url       = {http://dblp.uni-trier.de/db/conf/das/das2004.html#SchenkerBLK04},
  crossref  = {conf/das/2004},
  ee        = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=3163&spage=401},
  timestamp = {2005-01-05},
  interhash = {83ba06e8918a227fb2345e047e40f619},
  intrahash = {4450261ce5af13db99ce208800dff22c},
}

% A book (it has an ISBN/ASIN and no journal), so it is typed as a book rather
% than an article, which would lack its required journal field.
% NOTE(review): publisher supplied from general knowledge -- verify.
@book{keyhere,
  author     = {Schenker, Adam and Bunke, Horst and Last, Mark and Kandel, Abraham},
  title      = {Graph-Theoretic Techniques for Web Content Mining},
  publisher  = {World Scientific},
  year       = {2005},
  isbn       = {9812563393},
  asin       = {9812563393},
  url        = {http://www.amazon.ca/Graph-Theoretic-Techniques-Web-Content-Mining/dp/9812563393/ref=sr_1_7/701-3503486-7337153?ie=UTF8&s=books&qid=1175673405&sr=1-7},
  typesource = {Simple CitationSource},
  interhash  = {247e95a6025dff9119c7943b5a33f917},
  intrahash  = {3f9897fc8abcf1bcb1fd0212a23a4134},
}

% A book (it carries an ISBN), so it is typed as a book instead of misc.
% NOTE(review): publisher supplied from general knowledge -- verify.
@book{web2006witten,
  author    = {Witten, Ian and Gori, Marco and Numerico, Teresa},
  title     = {Web Dragons: Inside the Myths of Search Engine Technology},
  publisher = {Morgan Kaufmann},
  year      = {2006},
  isbn      = {0-12-370609-2},
  interhash = {8e54756085b574381d8f35d20dc989c7},
  intrahash = {26dd2b2627dc5ee6100840328e0c20f0},
}

% Journal article.
@article{han98hypergraph,
  author    = {Han, Eui-Hong and Karypis, George and Kumar, Vipin and Mobasher, Bamshad},
  title     = {Hypergraph Based Clustering in High-Dimensional Data Sets: A Summary of Results},
  journal   = {Data Engineering Bulletin},
  volume    = {21},
  number    = {1},
  pages     = {15--22},
  year      = {1998},
  url       = {http://citeseer.ist.psu.edu/han98hypergraph.html},
  interhash = {3bb7fb3fd3af41fac2db5460a5acfd2c},
  intrahash = {9723b092d975dedb8f6d5f711bb00ffd},
}