% Bibliography entries, one field per line.
% Percent signs inside field values are written as \% so that they cannot
% start a TeX comment when a style prints the field.

% Conference paper (SIGIR '04). "address" holds the publisher's city and
% "location" the conference venue, as exported.
@inproceedings{Shen:2004:WCT:1008992.1009035,
  abstract  = {Web-page classification is much more difficult than pure-text classification due to a large variety of noisy information embedded in Web pages. In this paper, we propose a new Web-page classification algorithm based on Web summarization for improving the accuracy. We first give empirical evidence that ideal Web-page summaries generated by human editors can indeed improve the performance of Web-page classification algorithms. We then propose a new Web summarization-based classification algorithm and evaluate it along with several other state-of-the-art text summarization algorithms on the LookSmart Web directory. Experimental results show that our proposed summarization-based classification algorithm achieves an approximately 8.8\% improvement as compared to pure-text-based classification algorithm. We further introduce an ensemble classifier using the improved summarization algorithm and show that it achieves about 12.9\% improvement over pure-text based methods.},
  acmid     = {1009035},
  address   = {New York, NY, USA},
  author    = {Shen, Dou and Chen, Zheng and Yang, Qiang and Zeng, Hua-Jun and Zhang, Benyu and Lu, Yuchang and Ma, Wei-Ying},
  booktitle = {Proceedings of the 27th annual international ACM SIGIR conference on Research and development in information retrieval},
  doi       = {10.1145/1008992.1009035},
  interhash = {328ff5b51cb573cd1d253f339892c029},
  intrahash = {b83fca9d43e5afdea78b9791cc07890c},
  isbn      = {1-58113-881-4},
  location  = {Sheffield, United Kingdom},
  numpages  = {8},
  pages     = {242--249},
  publisher = {ACM},
  series    = {SIGIR '04},
  title     = {Web-page classification through summarization},
  url       = {http://doi.acm.org/10.1145/1008992.1009035},
  year      = 2004
}

% Same paper as conf/sigir/HuFCZLYC08 below (identical authors, title, pages,
% ISBN and hashes). Both keys are kept so that existing citations of either
% key keep resolving. The crossref parent conf/sigir/2008 is not defined in
% this part of the file.
@inproceedings{hu2008enhancing,
  author    = {Hu, Jian and Fang, Lujun and Cao, Yang and Zeng, Hua-Jun and Li, Hua and Yang, Qiang and Chen, Zheng},
  booktitle = {SIGIR},
  crossref  = {conf/sigir/2008},
  doi       = {10.1145/1390334.1390367},
  editor    = {Myaeng, Sung-Hyon and Oard, Douglas W. and Sebastiani, Fabrizio and Chua, Tat-Seng and Leong, Mun-Kew},
  ee        = {http://doi.acm.org/10.1145/1390334.1390367},
  interhash = {0a2878165034dcdfacb9045608ec482a},
  intrahash = {76f863a12c0b983ec67682deaec1ada4},
  isbn      = {978-1-60558-164-4},
  pages     = {179--186},
  publisher = {ACM},
  title     = {Enhancing text clustering by leveraging {Wikipedia} semantics},
  url       = {http://dblp.uni-trier.de/db/conf/sigir/sigir2008.html#HuFCZLYC08},
  year      = 2008
}

% Second key for the paper above; additionally carries a full "date" field.
@inproceedings{conf/sigir/HuFCZLYC08,
  author    = {Hu, Jian and Fang, Lujun and Cao, Yang and Zeng, Hua-Jun and Li, Hua and Yang, Qiang and Chen, Zheng},
  booktitle = {SIGIR},
  crossref  = {conf/sigir/2008},
  date      = {2008-07-27},
  doi       = {10.1145/1390334.1390367},
  editor    = {Myaeng, Sung-Hyon and Oard, Douglas W. and Sebastiani, Fabrizio and Chua, Tat-Seng and Leong, Mun-Kew},
  ee        = {http://doi.acm.org/10.1145/1390334.1390367},
  interhash = {0a2878165034dcdfacb9045608ec482a},
  intrahash = {76f863a12c0b983ec67682deaec1ada4},
  isbn      = {978-1-60558-164-4},
  pages     = {179--186},
  publisher = {ACM},
  title     = {Enhancing text clustering by leveraging {Wikipedia} semantics},
  url       = {http://dblp.uni-trier.de/db/conf/sigir/sigir2008.html#HuFCZLYC08},
  year      = 2008
}

% Journal article. "month" uses the predefined three-letter macro, unquoted.
@article{wu2008wu,
  abstract  = {This paper presents the top 10 data mining algorithms identified by the IEEE International Conference on Data Mining (ICDM) in December 2006: C4.5, k-Means, SVM, Apriori, EM, PageRank, AdaBoost, kNN, Naive Bayes, and CART. These top 10 algorithms are among the most influential data mining algorithms in the research community. With each algorithm, we provide a description of the algorithm, discuss the impact of the algorithm, and review current and further research on the algorithm. These 10 algorithms cover classification, clustering, statistical learning, association analysis, and link mining, which are all among the most important topics in data mining research and development.},
  address   = {London},
  author    = {Wu, Xindong and Kumar, Vipin and Quinlan, J. Ross and Ghosh, Joydeep and Yang, Qiang and Motoda, Hiroshi and McLachlan, Geoffrey and Ng, Angus and Liu, Bing and Yu, Philip and Zhou, Zhi-Hua and Steinbach, Michael and Hand, David and Steinberg, Dan},
  doi       = {10.1007/s10115-007-0114-2},
  interhash = {76fd294a34cf85638f6e194a85af8db9},
  intrahash = {2c34bb4b49187a6d3e780e78d254ae1f},
  issn      = {0219-1377},
  journal   = {Knowledge and Information Systems},
  month     = jan,
  number    = 1,
  pages     = {1--37},
  publisher = {Springer},
  title     = {Top 10 algorithms in data mining},
  url       = {http://dx.doi.org/10.1007/s10115-007-0114-2},
  volume    = 14,
  year      = 2008
}