% Journal article (Zhai and Lafferty, 2004) on smoothing methods for
% language-model-based information retrieval.
% interhash/intrahash are not standard BibTeX fields (presumably identifiers
% added by the exporting tool -- confirm); standard styles ignore them.
@article{984322,
  author    = {Zhai, Chengxiang and Lafferty, John},
  title     = {A study of smoothing methods for language models applied to information retrieval},
  journal   = {{ACM} Transactions on Information Systems},
  volume    = {22},
  number    = {2},
  pages     = {179--214},
  year      = {2004},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  issn      = {1046-8188},
  doi       = {10.1145/984321.984322},
  url       = {http://portal.acm.org/citation.cfm?id=984322},
  interhash = {4d0acc84788713f07adbe0df3adc92d8},
  intrahash = {c7aff853599cdde58a1d27eff4ede314},
  abstract  = {Language modeling approaches to information retrieval are attractive and promising because they connect the problem of retrieval with that of language model estimation, which has been studied extensively in other application areas such as speech recognition. The basic idea of these approaches is to estimate a language model for each document, and to then rank documents by the likelihood of the query according to the estimated language model. A central issue in language model estimation is smoothing, the problem of adjusting the maximum likelihood estimator to compensate for data sparseness. In this article, we study the problem of language model smoothing and its influence on retrieval performance. We examine the sensitivity of retrieval performance to the smoothing parameters and compare several popular smoothing methods on different test collections. Experimental results show that not only is the retrieval performance generally sensitive to the smoothing parameters, but also the sensitivity pattern is affected by the query type, with performance being more sensitive to smoothing for verbose queries than for keyword queries. Verbose queries also generally require more aggressive smoothing to achieve optimal performance. This suggests that smoothing plays two different roles---to make the estimated document language model more accurate and to ``explain'' the noninformative words in the query. In order to decouple these two distinct roles of smoothing, we propose a two-stage smoothing strategy, which yields better sensitivity patterns and facilitates the setting of smoothing parameters automatically. We further propose methods for estimating the smoothing parameters automatically. Evaluation on five different databases and four types of queries indicates that the two-stage smoothing method with the proposed parameter estimation methods consistently gives retrieval performance that is close to---or better than---the best results achieved using a single smoothing method and exhaustive parameter search on the test data.},
}

% Journal article (Richter and MacFarlane, 2005) on using metadata in
% automated patent classification.
% NOTE(review): the author field carries surnames only; given names are not
% present in this record -- TODO add them in "Last, First" form once verified.
% The doi value is taken from the dx.doi.org resolver link in the url field.
@article{richter2005metadata,
  author    = {Richter and MacFarlane},
  title     = {The impact of metadata on the accuracy of automated patent classification},
  journal   = {World Patent Information},
  volume    = {27},
  pages     = {12--26},
  year      = {2005},
  doi       = {10.1016/j.wpi.2004.08.001},
  url       = {http://dx.doi.org/10.1016/j.wpi.2004.08.001},
  interhash = {c7749092c6e5a90cd43fe022fa398e0b},
  intrahash = {d15595d5279e762207d67f2a9b688c37},
  abstract  = {During the last decade, the advance of machine-learning tools and algorithms has resulted in tremendous progress in the automated classification of documents. However, many classifiers base their classification decisions solely on document text and ignore metadata (such as authors, publication date, and author affiliation). In this project, automated classifiers using the k-Nearest Neighbour algorithm were developed for the classification of patents into two different classification systems. Those using metadata (in this case inventor names, applicant names and International Patent Classification codes) were compared with those ignoring it. The use of metadata could significantly improve the classification of patents with one classification system, improving classification accuracy from 70.8\% up to 75.4\%, which was highly statistically significant. However, the results for the other classification system were inconclusive: while metadata could improve the quality of the classifier for some experiments (recall increased from 66.0\% to 68.9\%, which was a small but nonetheless significant improvement), experiments with different parameters showed that it could also lead to a deterioration of quality (recall dropping as low as 61.0\%). The study shows that metadata can play an extremely useful role in the classification of patents. Nonetheless, it must not be used indiscriminately but only after careful evaluation of its usefulness.},
}