% Bibliography: ACM conference papers (SIGIR, WWW, CIKM, MM, MIR), 2004-2008.
%
% Conventions used in this file:
%   - one field per line, bibliographic fields first, abstract last;
%   - doi holds the bare DOI (no resolver prefix); url holds the full link;
%   - address is the publisher's city, location is the conference venue;
%   - acmid, numpages, interhash and intrahash are export metadata that
%     standard styles ignore.

@inproceedings{Shen:2004:WCT:1008992.1009035,
  author    = {Shen, Dou and Chen, Zheng and Yang, Qiang and Zeng, Hua-Jun and Zhang, Benyu and Lu, Yuchang and Ma, Wei-Ying},
  title     = {Web-page classification through summarization},
  booktitle = {Proceedings of the 27th annual international ACM SIGIR conference on Research and development in information retrieval},
  series    = {SIGIR '04},
  year      = {2004},
  pages     = {242--249},
  numpages  = {8},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Sheffield, United Kingdom},
  isbn      = {1-58113-881-4},
  doi       = {10.1145/1008992.1009035},
  url       = {http://doi.acm.org/10.1145/1008992.1009035},
  acmid     = {1009035},
  interhash = {328ff5b51cb573cd1d253f339892c029},
  intrahash = {b83fca9d43e5afdea78b9791cc07890c},
  abstract  = {Web-page classification is much more difficult than pure-text classification due to a large variety of noisy information embedded in Web pages. In this paper, we propose a new Web-page classification algorithm based on Web summarization for improving the accuracy. We first give empirical evidence that ideal Web-page summaries generated by human editors can indeed improve the performance of Web-page classification algorithms. We then propose a new Web summarization-based classification algorithm and evaluate it along with several other state-of-the-art text summarization algorithms on the LookSmart Web directory. Experimental results show that our proposed summarization-based classification algorithm achieves an approximately 8.8\% improvement as compared to pure-text-based classification algorithm. We further introduce an ensemble classifier using the improved summarization algorithm and show that it achieves about 12.9\% improvement over pure-text based methods.},
}

@inproceedings{liu2005experimental,
  author    = {Liu, Tie-Yan and Yang, Yiming and Wan, Hao and Zhou, Qian and Gao, Bin and Zeng, Hua-Jun and Chen, Zheng and Ma, Wei-Ying},
  title     = {An experimental study on large-scale web categorization},
  booktitle = {Special interest tracks and posters of the 14th international conference on World Wide Web},
  series    = {WWW '05},
  year      = {2005},
  pages     = {1106--1107},
  numpages  = {2},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Chiba, Japan},
  isbn      = {1-59593-051-5},
  doi       = {10.1145/1062745.1062891},
  url       = {http://doi.acm.org/10.1145/1062745.1062891},
  acmid     = {1062891},
  interhash = {e581e4dd2ed6d748031a812c724c4b7c},
  intrahash = {36cc9f92b9c722b2aff441b23e44b2f7},
  abstract  = {Taxonomies of the Web typically have hundreds of thousands of categories and skewed category distribution over documents. It is not clear whether existing text classification technologies can perform well on and scale up to such large-scale applications. To understand this, we conducted the evaluation of several representative methods (Support Vector Machines, k-Nearest Neighbor and Naive Bayes) with Yahoo! taxonomies. In particular, we evaluated the effectiveness/efficiency tradeoff in classifiers with hierarchical setting compared to conventional (flat) setting, and tested popular threshold tuning strategies for their scalability and accuracy in large-scale classification problems.},
}

@inproceedings{zhao2006timedependent,
  author    = {Zhao, Qiankun and Hoi, Steven C. H. and Liu, Tie-Yan and Bhowmick, Sourav S. and Lyu, Michael R. and Ma, Wei-Ying},
  title     = {Time-dependent semantic similarity measure of queries using historical click-through data},
  booktitle = {WWW '06: Proceedings of the 15th international conference on World Wide Web},
  year      = {2006},
  pages     = {543--552},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Edinburgh, Scotland},
  isbn      = {1-59593-323-9},
  doi       = {10.1145/1135777.1135858},
  url       = {http://portal.acm.org/citation.cfm?id=1135777.1135858},
  interhash = {c765e101c37f6b530e2c1c59808048d7},
  intrahash = {57cbc64550d3a1b5b8599a0783e95111},
  abstract  = {It has become a promising direction to measure similarity of Web search queries by mining the increasing amount of click-through data logged by Web search engines, which record the interactions between users and the search engines. Most existing approaches employ the click-through data for similarity measure of queries with little consideration of the temporal factor, while the click-through data is often dynamic and contains rich temporal information. In this paper we present a new framework of time-dependent query semantic similarity model on exploiting the temporal characteristics of historical click-through data. The intuition is that more accurate semantic similarity values between queries can be obtained by taking into account the timestamps of the log data. With a set of user-defined calendar schema and calendar patterns, our time-dependent query similarity model is constructed using the marginalized kernel technique, which can exploit both explicit similarity and implicit semantics from the click-through data effectively. Experimental results on a large set of click-through data acquired from a commercial search engine show that our time-dependent query similarity model is more accurate than the existing approaches. Moreover, we observe that our time-dependent query similarity model can, to some extent, reflect real-world semantics such as real-world events that are happening over time.},
}

@inproceedings{wu2007visual,
  author    = {Wu, Lei and Li, Mingjing and Li, Zhiwei and Ma, Wei-Ying and Yu, Nenghai},
  title     = {Visual language modeling for image classification},
  booktitle = {MIR '07: Proceedings of the international workshop on Workshop on multimedia information retrieval},
  year      = {2007},
  pages     = {115--124},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Augsburg, Bavaria, Germany},
  isbn      = {978-1-59593-778-0},
  doi       = {10.1145/1290082.1290101},
  url       = {http://portal.acm.org/citation.cfm?id=1290101},
  interhash = {d03d88ef516f46e0948878410f7e228f},
  intrahash = {9924cd30ab804a299a23225823527c83},
}

@inproceedings{wu2008flickr,
  author    = {Wu, Lei and Hua, Xian-Sheng and Yu, Nenghai and Ma, Wei-Ying and Li, Shipeng},
  title     = {Flickr distance},
  booktitle = {MM '08: Proceeding of the 16th ACM international conference on Multimedia},
  year      = {2008},
  pages     = {31--40},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Vancouver, British Columbia, Canada},
  isbn      = {978-1-60558-303-7},
  doi       = {10.1145/1459359.1459364},
  url       = {http://portal.acm.org/citation.cfm?doid=1459359.1459364},
  interhash = {7aeae0773262c83a7efd2f0757ec5290},
  intrahash = {f8f536ebee1f06fd53bc8b28f7f124c0},
}

@inproceedings{1031192,
  author    = {Xue, Gui-Rong and Zeng, Hua-Jun and Chen, Zheng and Yu, Yong and Ma, Wei-Ying and Xi, WenSi and Fan, WeiGuo},
  title     = {Optimizing web search using web click-through data},
  booktitle = {CIKM '04: Proceedings of the thirteenth ACM international conference on Information and knowledge management},
  year      = {2004},
  pages     = {118--126},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Washington, D.C., USA},
  isbn      = {1-58113-874-1},
  doi       = {10.1145/1031171.1031192},
  url       = {http://portal.acm.org/citation.cfm?id=1031171.1031192},
  interhash = {31bb65c1b57888b0529c1a11e981bbe8},
  intrahash = {2c9841b484ade7e9a8c9220662190c16},
  abstract  = {The performance of web search engines may often deteriorate due to the diversity and noisy information contained within web pages. User click-through data can be used to introduce more accurate description (metadata) for web pages, and to improve the search performance. However, noise and incompleteness, sparseness, and the volatility of web pages and queries are three major challenges for research work on user click-through log mining. In this paper, we propose a novel iterative reinforced algorithm to utilize the user click-through data to improve search performance. The algorithm fully explores the interrelations between queries and web pages, and effectively finds "virtual queries" for web pages and overcomes the challenges discussed above. Experiment results on a large set of MSN click-through log data show a significant improvement on search performance over the naive query log mining algorithm as well as the baseline search engine.},
}

% NOTE(review): key 1135858 below is the same paper as zhao2006timedependent
% (same DOI). It is kept as a separate key so documents that already cite it
% keep resolving; prefer zhao2006timedependent in new text, and never cite
% both keys in one document or the paper will be listed twice.

@inproceedings{1135858,
  author    = {Zhao, Qiankun and Hoi, Steven C. H. and Liu, Tie-Yan and Bhowmick, Sourav S. and Lyu, Michael R. and Ma, Wei-Ying},
  title     = {Time-dependent semantic similarity measure of queries using historical click-through data},
  booktitle = {WWW '06: Proceedings of the 15th international conference on World Wide Web},
  year      = {2006},
  pages     = {543--552},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Edinburgh, Scotland},
  isbn      = {1-59593-323-9},
  doi       = {10.1145/1135777.1135858},
  url       = {http://portal.acm.org/citation.cfm?id=1135858},
  interhash = {c765e101c37f6b530e2c1c59808048d7},
  intrahash = {57cbc64550d3a1b5b8599a0783e95111},
  abstract  = {It has become a promising direction to measure similarity of Web search queries by mining the increasing amount of click-through data logged by Web search engines, which record the interactions between users and the search engines. Most existing approaches employ the click-through data for similarity measure of queries with little consideration of the temporal factor, while the click-through data is often dynamic and contains rich temporal information. In this paper we present a new framework of time-dependent query semantic similarity model on exploiting the temporal characteristics of historical click-through data. The intuition is that more accurate semantic similarity values between queries can be obtained by taking into account the timestamps of the log data. With a set of user-defined calendar schema and calendar patterns, our time-dependent query similarity model is constructed using the marginalized kernel technique, which can exploit both explicit similarity and implicit semantics from the click-through data effectively. Experimental results on a large set of click-through data acquired from a commercial search engine show that our time-dependent query similarity model is more accurate than the existing approaches. Moreover, we observe that our time-dependent query similarity model can, to some extent, reflect real-world semantics such as real-world events that are happening over time.},
}

@inproceedings{1148187,
  author    = {Feng, Guang and Liu, Tie-Yan and Wang, Ying and Bao, Ying and Ma, Zhiming and Zhang, Xu-Dong and Ma, Wei-Ying},
  title     = {{AggregateRank}: bringing order to web sites},
  booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval},
  year      = {2006},
  pages     = {75--82},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Seattle, Washington, USA},
  isbn      = {1-59593-369-7},
  doi       = {10.1145/1148170.1148187},
  interhash = {21d1ae55f77976e7d7c97f2d579a0d15},
  intrahash = {683a3288d7e356c98ab0d67e9f42d426},
}