|
Rubin, T. N.; Chambers, A.; Smyth, P. & Steyvers, M.
(2011):
Statistical Topic Models for Multi-Label Document Classification.
[Volltext] [Kurzfassung] [BibTeX]
[Endnote]
Machine learning approaches to multi-label document classification have (to date) largely relied on discriminative modeling techniques such as support vector machines. A drawback of these approaches is that performance rapidly drops off as the total number of labels and the number of labels per document increase. This problem is amplified when the label frequencies exhibit the type of highly skewed distributions that are often observed in real-world datasets. In this paper we investigate a class of generative statistical topic models for multi-label documents that associate individual word tokens with different labels. We investigate the advantages of this approach relative to discriminative models, particularly with respect to classification problems involving large numbers of relatively rare labels. We compare the performance of generative and discriminative approaches on document labeling tasks ranging from datasets with several thousand labels to datasets with tens of labels. The experimental results indicate that generative models can achieve competitive multi-label classification performance compared to discriminative methods, and have advantages for datasets with many labels and skewed label frequencies.
@misc{Rubin2011,
  author        = {Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark},
  title         = {Statistical Topic Models for Multi-Label Document Classification},
  year          = {2011},
  eprint        = {1107.2462},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1107.2462},
  keywords      = {mining, model, text, tm, topic, toread},
  abstract      = {Machine learning approaches to multi-label document classification have (to date) largely relied on discriminative modeling techniques such as support vector machines. A drawback of these approaches is that performance rapidly drops off as the total number of labels and the number of labels per document increase. This problem is amplified when the label frequencies exhibit the type of highly skewed distributions that are often observed in real-world datasets. In this paper we investigate a class of generative statistical topic models for multi-label documents that associate individual word tokens with different labels. We investigate the advantages of this approach relative to discriminative models, particularly with respect to classification problems involving large numbers of relatively rare labels. We compare the performance of generative and discriminative approaches on document labeling tasks ranging from datasets with several thousand labels to datasets with tens of labels. The experimental results indicate that generative models can achieve competitive multi-label classification performance compared to discriminative methods, and have advantages for datasets with many labels and skewed label frequencies.}
}
%0 = misc
%A = Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark
%B =
%C =
%D = 2011
%I =
%T = Statistical Topic Models for Multi-Label Document Classification
%U = http://arxiv.org/abs/1107.2462
|
J |
Carpena, P.; Bernaola-Galván, P.; Hackenberg, M.; Coronado, A. V. & Oliver, J. L.
(2009):
Level statistics of words: Finding keywords in literary texts and symbolic sequences.
In: Physical Review E (Statistical, Nonlinear, and Soft Matter Physics),
Ausgabe/Number: 3,
Vol. 79,
Verlag/Publisher: APS.
Erscheinungsjahr/Year: 2009.
Seiten/Pages: 035102.
[Volltext] [BibTeX]
[Endnote]
@article{carpena:035102,
  author    = {Carpena, P. and Bernaola-Galván, P. and Hackenberg, M. and Coronado, A. V. and Oliver, J. L.},
  title     = {Level statistics of words: Finding keywords in literary texts and symbolic sequences},
  journal   = {Physical Review E (Statistical, Nonlinear, and Soft Matter Physics)},
  year      = {2009},
  volume    = {79},
  number    = {3},
  pages     = {035102},
  publisher = {APS},
  doi       = {10.1103/PhysRevE.79.035102},
  url       = {http://bioinfo2.ugr.es/TextKeywords/},
  keywords  = {analysis, extraction, keyword, statistical, text, tm, topic, toread}
}
%0 = article
%A = Carpena, P. and Bernaola-Galván, P. and Hackenberg, M. and Coronado, A. V. and Oliver, J. L.
%D = 2009
%I = APS
%T = Level statistics of words: Finding keywords in literary texts and symbolic sequences
%U = http://bioinfo2.ugr.es/TextKeywords/
|
P |
Huang, A.; Milne, D. N.; Frank, E. & Witten, I. H.
(2009):
Clustering Documents Using a Wikipedia-Based Concept Representation.
In: PAKDD,
[Volltext]
[BibTeX][Endnote]
@inproceedings{conf/pakdd/HuangMFW09,
  author    = {Huang, Anna and Milne, David N. and Frank, Eibe and Witten, Ian H.},
  title     = {Clustering Documents Using a {Wikipedia}-Based Concept Representation},
  editor    = {Theeramunkong, Thanaruk and Kijsirikul, Boonserm and Cercone, Nick and Ho, Tu Bao},
  booktitle = {PAKDD},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  year      = {2009},
  volume    = {5476},
  pages     = {628--636},
  url       = {http://dblp.uni-trier.de/db/conf/pakdd/pakdd2009.html#HuangMFW09},
  isbn      = {978-3-642-01306-5},
  keywords  = {background, clustering, knowledge, ontology, tm, wikipedia}
}
%0 = inproceedings
%A = Huang, Anna and Milne, David N. and Frank, Eibe and Witten, Ian H.
%B = PAKDD
%D = 2009
%I = Springer
%T = Clustering Documents Using a Wikipedia-Based Concept Representation.
%U = http://dblp.uni-trier.de/db/conf/pakdd/pakdd2009.html#HuangMFW09
|
J |
Heyer, G.; Quasthoff, U. & Wittig, T. (Hrsg.)
(2008):
Text Mining: Wissensrohstoff Text.
1. korr. Nachdr.. Aufl./Vol..
Erscheinungsjahr/Year: 2008.
Verlag/Publisher: W3L-Verl.,
Herdecke ; Bochum.
[Volltext] [BibTeX]
[Endnote]
@book{UBMA_280507895,
  author    = {Heyer, Gerhard and Quasthoff, Uwe and Wittig, Thomas},
  title     = {Text Mining: Wissensrohstoff Text},
  series    = {IT lernen},
  publisher = {W3L-Verl.},
  address   = {Herdecke ; Bochum},
  year      = {2008},
  edition   = {1. korr. Nachdr.},
  pages     = {XII, 348 S.},
  isbn      = {978-3-937137-30-8},
  url       = {http://aleph.bib.uni-mannheim.de/F/?func=find-b&request=280507895&find_code=020&adjacent=N&local_base=MAN01PUBLIC&x=0&y=0},
  keywords  = {einführung, mining, text, tm}
}
%0 = book
%A = Heyer, Gerhard and Quasthoff, Uwe and Wittig, Thomas
%C = Herdecke ; Bochum
%D = 2008
%I = W3L-Verl.
%T = Text Mining: Wissensrohstoff Text
%U = http://aleph.bib.uni-mannheim.de/F/?func=find-b&request=280507895&find_code=020&adjacent=N&local_base=MAN01PUBLIC&x=0&y=0
|
J |
(2007):
From Web to Social Web: Discovering and Deploying User and Content Profiles .
Erscheinungsjahr/Year: 2007.
Verlag/Publisher: Springer,
[Volltext] [Kurzfassung] [BibTeX]
[Endnote]
This book constitutes the refereed proceedings of the Workshop on Web Mining, WebMine 2006, held in Berlin, Germany, September 18th, 2006. Topics included are data mining based on analysis of bloggers and tagging, web mining, XML mining and further techniques of knowledge discovery. The book is especially valuable for those interested in the aspects of the Social Web (Web 2.0) and its inherent dynamic and diversity of user-generated content.
@book{Berendt2007,
  title     = {From Web to Social Web: Discovering and Deploying User and Content Profiles},
  editor    = {Berendt, B. and Hotho, A. and Mladenic, D. and Semeraro, G.},
  series    = {LNCS},
  publisher = {Springer},
  year      = {2007},
  volume    = {4736},
  url       = {http://www.springer.com/dal/home?SGWID=1-102-22-173759307-0&changeHeader=true&referer=www.springeronline.com&SHORTCUT=www.springer.com/978-3-540-74950-9},
  isbn      = {978-3-540-74950-9},
  keywords  = {2007, data, dm, mining, myown, social, tm, web},
  abstract  = {This book constitutes the refereed proceedings of the Workshop on Web Mining, WebMine 2006, held in Berlin, Germany, September 18th, 2006. Topics included are data mining based on analysis of bloggers and tagging, web mining, XML mining and further techniques of knowledge discovery. The book is especially valuable for those interested in the aspects of the Social Web (Web 2.0) and its inherent dynamic and diversity of user-generated content.}
}
%0 = book
%D = 2007
%I = Springer
%T = From Web to Social Web: Discovering and Deploying User and Content Profiles
%U = http://www.springer.com/dal/home?SGWID=1-102-22-173759307-0&changeHeader=true&referer=www.springeronline.com&SHORTCUT=www.springer.com/978-3-540-74950-9
|
J |
Feldman, R. & Sanger, J. (Hrsg.)
(2007):
The Text Mining Handbook: Advanced Approaches in Analyzing Unstructured Data.
Erscheinungsjahr/Year: 2007.
Verlag/Publisher: Cambridge University Press,
[Volltext] [BibTeX]
[Endnote]
@book{feldman2006mining,
  author    = {Feldman, Ronen and Sanger, James},
  title     = {The Text Mining Handbook: Advanced Approaches in Analyzing Unstructured Data},
  publisher = {Cambridge University Press},
  year      = {2007},
  isbn      = {0521836573},
  url       = {http://www.amazon.com/Text-Mining-Handbook-Approaches-Unstructured/dp/0521836573/ref=sr_1_1?s=books&ie=UTF8&qid=1295265273&sr=1-1},
  keywords  = {mining, text, tm}
}
%0 = book
%A = Feldman, Ronen and Sanger, James
%D = 2007
%I = Cambridge University Press
%T = The Text Mining Handbook: Advanced Approaches in Analyzing Unstructured Data
%U = http://www.amazon.com/Text-Mining-Handbook-Approaches-Unstructured/dp/0521836573/ref=sr_1_1?s=books&ie=UTF8&qid=1295265273&sr=1-1
|
J |
Colas, F. & Brazdil, P.
(2006):
On the Behavior of SVM and Some Older Algorithms in Binary Text Classification Tasks.
In: Text, Speech and Dialogue,
Erscheinungsjahr/Year: 2006.
Seiten/Pages: 45-52.
[Volltext] [Kurzfassung] [BibTeX]
[Endnote]
Document classification has already been widely studied. In fact, some studies compared feature selection techniques or feature
space transformation whereas some others compared the performance of different algorithms. Recently, following the rising interest towards the Support Vector Machine, various studies showed that the SVM outperforms other classification algorithms. So should we just not bother about other classification algorithms and opt always for SVM?
@inproceedings{colas2006behavior,
  author    = {Colas, Fabrice and Brazdil, Pavel},
  title     = {On the Behavior of {SVM} and Some Older Algorithms in Binary Text Classification Tasks},
  booktitle = {Text, Speech and Dialogue},
  year      = {2006},
  pages     = {45--52},
  doi       = {10.1007/11846406_6},
  url       = {http://dx.doi.org/10.1007/11846406_6},
  keywords  = {classification, knn, nb, preprocessing, svm, text, tm, toread},
  abstract  = {Document classification has already been widely studied. In fact, some studies compared feature selection techniques or feature
space transformation whereas some others compared the performance of different algorithms. Recently, following the rising interest towards the Support Vector Machine, various studies showed that the SVM outperforms other classification algorithms. So should we just not bother about other classification algorithms and opt always for SVM?}
}
%0 = article
%A = Colas, Fabrice and Brazdil, Pavel
%D = 2006
%T = On the Behavior of SVM and Some Older Algorithms in Binary Text Classification Tasks
%U = http://dx.doi.org/10.1007/11846406_6
|
J |
Crane, G.
(2006):
What Do You Do with a Million Books?.
In: D-Lib Magazine,
Ausgabe/Number: 3,
Vol. 12,
Erscheinungsjahr/Year: 2006.
[Volltext] [BibTeX]
[Endnote]
@article{march06crane,
  author   = {Crane, Gregory},
  title    = {What Do You Do with a Million Books?},
  journal  = {D-Lib Magazine},
  year     = {2006},
  volume   = {12},
  number   = {3},
  doi      = {10.1045/march2006-crane},
  issn     = {1082-9873},
  url      = {http://www.dlib.org/dlib/march06/crane/03crane.html},
  keywords = {Book, Mining, Text, google, tm, toread}
}
%0 = article
%A = Crane, Gregory
%D = 2006
%T = What Do You Do with a Million Books?
%U = http://www.dlib.org/dlib/march06/crane/03crane.html
|
J |
Hotho, A.; Nürnberger, A. & Paaß, G.
(2005):
A Brief Survey of Text Mining.
In: LDV Forum - GLDV Journal for Computational Linguistics and Language Technology,
Ausgabe/Number: 1,
Vol. 20,
Erscheinungsjahr/Year: 2005.
Seiten/Pages: 19-62.
[Volltext] [BibTeX]
[Endnote]
@article{hotho-etal-ldv-2005,
  author   = {Hotho, Andreas and Nürnberger, Andreas and Paaß, Gerhard},
  title    = {A Brief Survey of Text Mining},
  journal  = {LDV Forum - GLDV Journal for Computational Linguistics and Language Technology},
  year     = {2005},
  volume   = {20},
  number   = {1},
  pages    = {19--62},
  url      = {http://www.kde.cs.uni-kassel.de/hotho/pub/2005/hotho05TextMining.pdf},
  issn     = {0175-1336},
  keywords = {2005, SumSchool06, mining, myown, ontology, overview, survey, text, tm}
}
%0 = article
%A = Hotho, Andreas and Nürnberger, Andreas and Paaß, Gerhard
%D = 2005
%T = A Brief Survey of Text Mining
%U = http://www.kde.cs.uni-kassel.de/hotho/pub/2005/hotho05TextMining.pdf
|
J |
Weiss, S. M.; Indurkhya, N. & Zhang, T. (Hrsg.)
(2004):
Text Mining. Predictive Methods for Analyzing Unstructured Information.
1. Aufl./Vol..
Erscheinungsjahr/Year: 2004.
Verlag/Publisher: Springer, Berlin,
[Volltext] [BibTeX]
[Endnote]
@book{0387954333,
  author    = {Weiss, Sholom M. and Indurkhya, Nitin and Zhang, T.},
  title     = {Text Mining. Predictive Methods for Analyzing Unstructured Information},
  publisher = {Springer},
  address   = {Berlin},
  year      = {2004},
  edition   = {First},
  url       = {http://www.amazon.de/gp/redirect.html%3FASIN=0387954333%26tag=ws%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/0387954333%253FSubscriptionId=13CT5CVB80YFWJEPWS02},
  isbn      = {0387954333},
  keywords  = {dm, mining, nlp, software, text, tm}
}
%0 = book
%A = Weiss, Sholom M. and Indurkhya, Nitin and Zhang, T.
%D = 2004
%I = Springer, Berlin
%T = Text Mining. Predictive Methods for Analyzing Unstructured Information
%U = http://www.amazon.de/gp/redirect.html%3FASIN=0387954333%26tag=ws%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/0387954333%253FSubscriptionId=13CT5CVB80YFWJEPWS02
|
J |
Zhai, C. & Lafferty, J.
(2004):
A study of smoothing methods for language models applied to information retrieval.
In: ACM Trans. Inf. Syst.,
Ausgabe/Number: 2,
Vol. 22,
Verlag/Publisher: ACM Press.
Erscheinungsjahr/Year: 2004.
Seiten/Pages: 179-214.
[Volltext] [Kurzfassung] [BibTeX]
[Endnote]
Language modeling approaches to information retrieval are attractive and promising because they connect the problem of retrieval with that of language model estimation, which has been studied extensively in other application areas such as speech recognition. The basic idea of these approaches is to estimate a language model for each document, and to then rank documents by the likelihood of the query according to the estimated language model. A central issue in language model estimation is smoothing, the problem of adjusting the maximum likelihood estimator to compensate for data sparseness. In this article, we study the problem of language model smoothing and its influence on retrieval performance. We examine the sensitivity of retrieval performance to the smoothing parameters and compare several popular smoothing methods on different test collections. Experimental results show that not only is the retrieval performance generally sensitive to the smoothing parameters, but also the sensitivity pattern is affected by the query type, with performance being more sensitive to smoothing for verbose queries than for keyword queries. Verbose queries also generally require more aggressive smoothing to achieve optimal performance. This suggests that smoothing plays two different role--to make the estimated document language model more accurate and to "explain" the noninformative words in the query. In order to decouple these two distinct roles of smoothing, we propose a two-stage smoothing strategy, which yields better sensitivity patterns and facilitates the setting of smoothing parameters automatically. We further propose methods for estimating the smoothing parameters automatically. 
Evaluation on five different databases and four types of queries indicates that the two-stage smoothing method with the proposed parameter estimation methods consistently gives retrieval performance that is close to--or better than--the best results achieved using a single smoothing method and exhaustive parameter search on the test data.
@article{984322,
  author    = {Zhai, Chengxiang and Lafferty, John},
  title     = {A study of smoothing methods for language models applied to information retrieval},
  journal   = {ACM Trans. Inf. Syst.},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  year      = {2004},
  volume    = {22},
  number    = {2},
  pages     = {179--214},
  doi       = {10.1145/984321.984322},
  url       = {http://portal.acm.org/citation.cfm?id=984322},
  issn      = {1046-8188},
  keywords  = {ir, model, text, tm},
  abstract  = {Language modeling approaches to information retrieval are attractive and promising because they connect the problem of retrieval with that of language model estimation, which has been studied extensively in other application areas such as speech recognition. The basic idea of these approaches is to estimate a language model for each document, and to then rank documents by the likelihood of the query according to the estimated language model. A central issue in language model estimation is smoothing, the problem of adjusting the maximum likelihood estimator to compensate for data sparseness. In this article, we study the problem of language model smoothing and its influence on retrieval performance. We examine the sensitivity of retrieval performance to the smoothing parameters and compare several popular smoothing methods on different test collections. Experimental results show that not only is the retrieval performance generally sensitive to the smoothing parameters, but also the sensitivity pattern is affected by the query type, with performance being more sensitive to smoothing for verbose queries than for keyword queries. Verbose queries also generally require more aggressive smoothing to achieve optimal performance. This suggests that smoothing plays two different role---to make the estimated document language model more accurate and to "explain" the noninformative words in the query. In order to decouple these two distinct roles of smoothing, we propose a two-stage smoothing strategy, which yields better sensitivity patterns and facilitates the setting of smoothing parameters automatically. We further propose methods for estimating the smoothing parameters automatically. 
Evaluation on five different databases and four types of queries indicates that the two-stage smoothing method with the proposed parameter estimation methods consistently gives retrieval performance that is close to---or better than---the best results achieved using a single smoothing method and exhaustive parameter search on the test data.}
}
%0 = article
%A = Zhai, Chengxiang and Lafferty, John
%C = New York, NY, USA
%D = 2004
%I = ACM Press
%T = A study of smoothing methods for language models applied to information retrieval
%U = http://portal.acm.org/citation.cfm?id=984322
|
I |
Hotho, A.; Maedche, A.; Staab, S. & Zacharias, V.
(2003):
On Knowledgeable Unsupervised Text Mining.
In: Text Mining.
Erscheinungsjahr/Year: 2003.
Seiten/Pages: 131-152.
[BibTeX]
[Endnote]
@incollection{hotho03tm,
  author    = {Hotho, Andreas and Maedche, Alexander and Staab, Steffen and Zacharias, Valentin},
  title     = {On Knowledgeable Unsupervised Text Mining},
  booktitle = {Text Mining},
  year      = {2003},
  pages     = {131--152},
  keywords  = {2003, myown, tm}
}
%0 = incollection
%A = Hotho, Andreas and Maedche, Alexander and Staab, Steffen and Zacharias, Valentin
%B = Text Mining
%D = 2003
%T = On Knowledgeable Unsupervised Text Mining
|
P |
Popescul, A.; Ungar, L. H.; Lawrence, S. & Pennock, D. M.
(2003):
Statistical Relational Learning for Document Mining..
In: ICDM,
[Volltext]
[BibTeX][Endnote]
@inproceedings{conf/icdm/PopesculULP03,
  author    = {Popescul, Alexandrin and Ungar, Lyle H. and Lawrence, Steve and Pennock, David M.},
  title     = {Statistical Relational Learning for Document Mining},
  booktitle = {ICDM},
  publisher = {IEEE Computer Society},
  year      = {2003},
  pages     = {275--282},
  url       = {http://www.cis.upenn.edu/~popescul/Publications/popescul03dm.pdf},
  isbn      = {0-7695-1978-4},
  keywords  = {2003, classification, document, mining, srl, text, tm}
}
%0 = inproceedings
%A = Popescul, Alexandrin and Ungar, Lyle H. and Lawrence, Steve and Pennock, David M.
%B = ICDM
%D = 2003
%I = IEEE Computer Society
%T = Statistical Relational Learning for Document Mining.
%U = http://www.cis.upenn.edu/~popescul/Publications/popescul03dm.pdf
|
J |
Sebastiani, F.
(2002):
Machine learning in automated text categorization.
In: ACM Computing Surveys,
Ausgabe/Number: 1,
Vol. 34,
Erscheinungsjahr/Year: 2002.
Seiten/Pages: 1-47.
[Volltext] [BibTeX]
[Endnote]
@article{Sebastiani02,
  author   = {Sebastiani, Fabrizio},
  title    = {Machine learning in automated text categorization},
  journal  = {ACM Computing Surveys},
  year     = {2002},
  volume   = {34},
  number   = {1},
  pages    = {1--47},
  url      = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf},
  keywords = {tm, text, survey, classification, categorization, ml}
}
%0 = article
%A = Sebastiani, F.
%D = 2002
%T = Machine learning in automated text categorization
%U = http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf
|
P |
Hotho, A.; Maedche, A. & Staab, S.
(2001):
Text Clustering Based on Good Aggregations.
In: ICDM '01: Proceedings of the 2001 IEEE International Conference on Data Mining,
Washington, DC, USA.
[Volltext]
[BibTeX][Endnote]
@inproceedings{658040,
  author    = {Hotho, Andreas and Maedche, Alexander and Staab, Steffen},
  title     = {Text Clustering Based on Good Aggregations},
  booktitle = {ICDM '01: Proceedings of the 2001 IEEE International Conference on Data Mining},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  year      = {2001},
  pages     = {607--608},
  isbn      = {0-7695-1119-8},
  url       = {http://portal.acm.org/citation.cfm?id=658040},
  keywords  = {2001, clustering, gruppenbildung, kmeans, myown, ontology, text, tm}
}
%0 = inproceedings
%A = Hotho, Andreas and Maedche, Alexander and Staab, Steffen
%B = ICDM '01: Proceedings of the 2001 IEEE International Conference on Data Mining
%C = Washington, DC, USA
%D = 2001
%I = IEEE Computer Society
%T = Text Clustering Based on Good Aggregations
%U = http://portal.acm.org/citation.cfm?id=658040
|
P |
Feldman, R. & Dagan, I.
(1995):
Knowledge Discovery in Textual Databases (KDT).
In: Proc. of the First Int. Conf. on Knowledge Discovery (KDD),
[BibTeX][Endnote]
@inproceedings{feldman95KDT,
  author    = {Feldman, R. and Dagan, I.},
  title     = {Knowledge Discovery in Textual Databases ({KDT})},
  booktitle = {Proc. of the First Int. Conf. on Knowledge Discovery (KDD)},
  year      = {1995},
  pages     = {112--117},
  keywords  = {mining, text, tm}
}
%0 = inproceedings
%A = Feldman, R. and Dagan, I.
%B = Proc. of the First Int. Conf. on Knowledge Discovery (KDD)
%D = 1995
%T = Knowledge Discovery in Textual Databases (KDT)
|