@article{thelwall2012journal, abstract = {In theory, the web has the potential to provide information about the wider impact of academic research, beyond traditional scholarly impact. This is because the web can reflect non-scholarly uses of research, such as in online government documents, press coverage or public discussions. Nevertheless, there are practical problems with creating metrics for journals based on web data: principally that most such metrics should be easy for journal editors or publishers to manipulate. Nevertheless, two alternatives seem to have both promise and value: citations derived from digitised books and download counts for journals within specific delivery platforms.}, author = {Thelwall, Mike}, doi = {10.1007/s11192-012-0669-x}, interhash = {834707cf0663109f7811a14ae746be72}, intrahash = {284883bbaa636a0bab13fc54b903f363}, issn = {0138-9130}, journal = {Scientometrics}, language = {English}, number = 2, pages = {429--441}, publisher = {Springer Netherlands}, title = {Journal impact evaluation: a webometric perspective}, url = {http://dx.doi.org/10.1007/s11192-012-0669-x}, volume = 92, year = 2012 } @inproceedings{joachims2005accurately, abstract = {This paper examines the reliability of implicit feedback generated from clickthrough data in WWW search. Analyzing the users' decision process using eyetracking and comparing implicit feedback against manual relevance judgments, we conclude that clicks are informative but biased. While this makes the interpretation of clicks as absolute relevance judgments difficult, we show that relative preferences derived from clicks are reasonably accurate on average.}, acmid = {1076063}, address = {New York, NY, USA}, author = {Joachims, Thorsten and Granka, Laura and Pan, Bing and Hembrooke, Helene and Gay, Geri}, booktitle = {Proceedings of the 28th annual international ACM SIGIR conference on Research and development in information retrieval}, doi = {10.1145/1076034.1076063}, interhash = {050982b76855a6b1258ed0b40cb69018}, intrahash = {8c488477626fa59db419ac77f3552029}, isbn = {1-59593-034-5}, location = {Salvador, Brazil}, numpages = {8}, pages = {154--161}, publisher = {ACM}, title = {Accurately interpreting clickthrough data as implicit feedback}, url = {http://doi.acm.org/10.1145/1076034.1076063}, year = 2005 } @presentation{kohavi2012online, abstract = {The web provides an unprecedented opportunity to accelerate innovation by evaluating ideas quickly and accurately using controlled experiments (e.g., A/B tests and their generalizations). Whether for front-end user-interface changes, or backend recommendation systems and relevance algorithms, online controlled experiments are now utilized to make data-driven decisions at Amazon, Microsoft, eBay, Facebook, Google, Yahoo, Zynga, and at many other companies. While the theory of a controlled experiment is simple, and dates back to Sir Ronald A. Fisher’s experiments at the Rothamsted Agricultural Experimental Station in England in the 1920s, the deployment and mining of online controlled experiments at scale—thousands of experiments now—has taught us many lessons. We provide an introduction, share real examples, key learnings, cultural challenges, and humbling statistics. 
}, author = {Kohavi, Ron}, day = 12, interhash = {36a473c449c5ede0589c2801781a0579}, intrahash = {aa31e13651d5d1eab42e449e55a0e745}, month = sep, title = {Online Controlled Experiments: Introduction, Learnings, and Humbling Statistics}, type = {Industry keynote at ACM Recommender Systems}, url = {http://www.exp-platform.com/Pages/2012RecSys.aspx}, year = 2012 } @article{alonso2008crowdsourcing, abstract = {Relevance evaluation is an essential part of the development and maintenance of information retrieval systems. Yet traditional evaluation approaches have several limitations; in particular, conducting new editorial evaluations of a search system can be very expensive. We describe a new approach to evaluation called TERC, based on the crowdsourcing paradigm, in which many online users, drawn from a large community, each performs a small evaluation task.}, acmid = {1480508}, address = {New York, NY, USA}, author = {Alonso, Omar and Rose, Daniel E. and Stewart, Benjamin}, doi = {10.1145/1480506.1480508}, interhash = {8441d7fed92813634f61fa148ef2b870}, intrahash = {4a47833e85558b740788607cb79ba795}, issn = {0163-5840}, issue_date = {December 2008}, journal = {SIGIR Forum}, month = nov, number = 2, numpages = {7}, pages = {9--15}, publisher = {ACM}, title = {Crowdsourcing for relevance evaluation}, url = {http://doi.acm.org/10.1145/1480506.1480508}, volume = 42, year = 2008 } @article{jarvelin2002cumulated, abstract = {Modern large retrieval environments tend to overwhelm their users by their large output. Since all documents are not of equal relevance to their users, highly relevant documents should be identified and ranked first for presentation. In order to develop IR techniques in this direction, it is necessary to develop evaluation approaches and methods that credit IR methods for their ability to retrieve highly relevant documents. This can be done by extending traditional evaluation methods, that is, recall and precision based on binary relevance judgments, to graded relevance judgments. Alternatively, novel measures based on graded relevance judgments may be developed. This article proposes several novel measures that compute the cumulative gain the user obtains by examining the retrieval result up to a given ranked position. The first one accumulates the relevance scores of retrieved documents along the ranked result list. The second one is similar but applies a discount factor to the relevance scores in order to devaluate late-retrieved documents. The third one computes the relative-to-the-ideal performance of IR techniques, based on the cumulative gain they are able to yield. These novel measures are defined and discussed and their use is demonstrated in a case study using TREC data: sample system run results for 20 queries in TREC-7. As a relevance base we used novel graded relevance judgments on a four-point scale. The test results indicate that the proposed measures credit IR methods for their ability to retrieve highly relevant documents and allow testing of statistical significance of effectiveness differences. 
The graphs based on the measures also provide insight into the performance of IR techniques and allow interpretation, for example, from the user point of view.}, address = {New York, NY, USA}, author = {Järvelin, Kalervo and Kekäläinen, Jaana}, doi = {10.1145/582415.582418}, interhash = {c46348827790803e8e7465ffd1a13376}, intrahash = {12176d90012ed75f57996af0b9240d02}, issn = {1046-8188}, journal = {ACM Transactions on Information Systems}, month = oct, number = 4, pages = {422--446}, publisher = {ACM}, title = {Cumulated gain-based evaluation of IR techniques}, url = {http://portal.acm.org/citation.cfm?id=582418}, volume = 20, year = 2002 } @inproceedings{jarvelin2000ir, abstract = {This paper proposes evaluation methods based on the use of non-dichotomous relevance judgements in IR experiments. It is argued that evaluation methods should credit IR methods for their ability to retrieve highly relevant documents. This is desirable from the user point of view in modern large IR environments. The proposed methods are (1) a novel application of P-R curves and average precision computations based on separate recall bases for documents of different degrees of relevance, and (2) two novel measures computing the cumulative gain the user obtains by examining the retrieval result up to a given ranked position. We then demonstrate the use of these evaluation methods in a case study on the effectiveness of query types, based on combinations of query structures and expansion, in retrieving documents of various degrees of relevance. The test was run with a best match retrieval system (In-Query1) in a text database consisting of newspaper articles. The results indicate that the tested strong query structures are most effective in retrieving highly relevant documents. The differences between the query types are practically essential and statistically significant. More generally, the novel evaluation methods and the case demonstrate that non-dichotomous relevance assessments are applicable in IR experiments, may reveal interesting phenomena, and allow harder testing of IR methods.}, address = {New York, NY, USA}, author = {Järvelin, Kalervo and Kekäläinen, Jaana}, booktitle = {SIGIR '00: Proceedings of the 23rd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval}, doi = {10.1145/345508.345545}, interhash = {a62a44c48d24acc64cd6713f21111d72}, intrahash = {12592d5f805db5bd127ee5abae1a4325}, isbn = {1-58113-226-3}, location = {Athens, Greece}, pages = {41--48}, publisher = {ACM}, title = {IR evaluation methods for retrieving highly relevant documents}, url = {http://portal.acm.org/citation.cfm?id=345545}, year = 2000 } @mastersthesis{dewit2008evaluating, abstract = {It was January 2007 when Dolf Trieschnigg, my supervisor for the course Information Retrieval, first told me about the availability of graduation projects at TNO Information and Communication Technology. This was a bull's eye since I just started to orientate myself on a subject to investigate. I was also looking for a company where I could perform my research since I wanted to get familiar with working in a professional environment. TNO Information and Communication Technology was one of the companies that seemed interesting to me, so I contacted Stephan Raaijmakers for more information. The subject of the proposed research project, evaluating recommender systems, was completely new to me, but seemed fascinating. And it is. 
In September 2008 I started my research by crafting the goals and research questions for the project. Now, almost nine months later, the research has resulted in the report that you just started reading. It marks the end of my life as a student (at least for now) and the start of my professional career at TNO. At TNO I can continue to work on personalisation and recommender systems. }, author = {de Wit, Joost}, eprintid = {13083}, howpublished = {http://eprints.eemcs.utwente.nl/13083/}, interhash = {afd570774d6f14087bee844b27111d9d}, intrahash = {622d0d0b9e6b289d5200f9ad822f50e2}, month = may, num_pages = {107}, research_groups = {EWI-DB: Databases}, school = {University of Twente}, supervisors = {D. Hiemstra}, title = {Evaluating Recommender Systems}, url = {http://eprints.eemcs.utwente.nl/13083/}, year = 2008 } @article{voelker2008aeon, abstract = {OntoClean is an approach towards the formal evaluation of taxonomic relations in ontologies. The application of OntoClean consists of two main steps. First, concepts are tagged according to meta-properties known as rigidity, unity, dependency and identity. Second, the tagged concepts are checked according to predefined constraints to discover taxonomic errors. Although OntoClean is well documented in numerous publications, it is still used rather infrequently due to the high costs of application. Especially, the manual tagging of concepts with the correct meta-properties requires substantial efforts of highly experienced ontology engineers. In order to facilitate the use of OntoClean and to enable the evaluation of real-world ontologies, we provide AEON, a tool which automatically tags concepts with appropriate OntoClean meta-properties and performs the constraint checking. We use the Web as an embodiment of world knowledge, where we search for patterns that indicate how to properly tag concepts. We thoroughly evaluated our approach against a manually created gold standard. The evaluation shows the competitiveness of our approach while at the same time significantly lowering the costs. All of our results, i.e. the tool AEON as well as the experiment data, are publicly available.}, address = {Amsterdam, The Netherlands, The Netherlands}, author = {Völker, Johanna and Vrandečić, Denny and Sure, York and Hotho, Andreas}, interhash = {f14794f4961d0127dc50c1938eaef7ea}, intrahash = {f8f0bb3e3495e7627770b470d1a5f1a3}, issn = {1570-5838}, journal = {Applied Ontology}, number = {1-2}, pages = {41--62}, publisher = {IOS Press}, title = {AEON - An approach to the automatic evaluation of ontologies}, url = {http://portal.acm.org/citation.cfm?id=1412422}, volume = 3, year = 2008 } @inproceedings{davis2006relationship, abstract = {Receiver Operator Characteristic (ROC) curves are commonly used to present results for binary decision problems in machine learning. However, when dealing with highly skewed datasets, Precision-Recall (PR) curves give a more informative picture of an algorithm's performance. We show that a deep connection exists between ROC space and PR space, such that a curve dominates in ROC space if and only if it dominates in PR space. A corollary is the notion of an achievable PR curve, which has properties much like the convex hull in ROC space; we show an efficient algorithm for computing this curve. Finally, we also note differences in the two types of curves are significant for algorithm design. For example, in PR space it is incorrect to linearly interpolate between points. 
Furthermore, algorithms that optimize the area under the ROC curve are not guaranteed to optimize the area under the PR curve.}, address = {New York, NY, USA}, author = {Davis, Jesse and Goadrich, Mark}, booktitle = {ICML '06: Proceedings of the 23rd international conference on Machine learning}, doi = {10.1145/1143844.1143874}, interhash = {e4ea92aea3ff8bbb3eb04c64867505f2}, intrahash = {4cc51d680241bab2326e28dfea42c9ea}, isbn = {1-59593-383-2}, location = {Pittsburgh, Pennsylvania}, pages = {233--240}, publisher = {ACM}, title = {The relationship between Precision-Recall and ROC curves}, url = {http://portal.acm.org/citation.cfm?id=1143874}, year = 2006 } @inproceedings{lewis1991evaluating, abstract = {While certain standard procedures are widely used for evaluating text retrieval systems and algorithms, the same is not true for text categorization. Omission of important data from reports is common and methods of measuring effectiveness vary widely. This has made judging the relative merits of techniques for text categorization difficult and has disguised important research issues. In this paper I discuss a variety of ways of evaluating the effectiveness of text categorization systems, drawing both on reported categorization experiments and on methods used in evaluating query-driven retrieval. I also consider the extent to which the same evaluation methods may be used with systems for text extraction, a more complex task. In evaluating either kind of system, the purpose for which the output is to be used is crucial in choosing appropriate evaluation methods.}, address = {San Mateo}, author = {Lewis, David D.}, booktitle = {Proceedings of Speech and Natural Language Workshop}, interhash = {a9c64235f49e18a6b80c306b61ff40c2}, intrahash = {2e8f19bde0a73d96d16c071b2016073f}, month = feb, pages = {312--318}, publisher = {Morgan Kaufmann}, title = {Evaluating text categorization}, url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.56.9675&rep=rep1&type=pdf}, year = 1991 } @article{herlocker2004evaluating, abstract = {Recommender systems have been evaluated in many, often incomparable, ways. In this article, we review the key decisions in evaluating collaborative filtering recommender systems: the user tasks being evaluated, the types of analysis and datasets being used, the ways in which prediction quality is measured, the evaluation of prediction attributes other than quality, and the user-based evaluation of the system as a whole. In addition to reviewing the evaluation strategies used by prior researchers, we present empirical results from the analysis of various accuracy metrics on one content domain where all the tested metrics collapsed roughly into three equivalence classes. Metrics within each equivalency class were strongly correlated, while metrics from different equivalency classes were uncorrelated.}, address = {New York, NY, USA}, author = {Herlocker, Jonathan L. and Konstan, Joseph A. and Terveen, Loren G. and Riedl, John T.}, doi = {10.1145/963770.963772}, interhash = {f8a70731d983634ac7105896d101c9d2}, intrahash = {bdd3980bb3c297d1b84ceb0c7729d397}, issn = {1046-8188}, journal = {ACM Transactions on Information Systems}, number = 1, pages = {5--53}, publisher = {ACM Press}, title = {Evaluating collaborative filtering recommender systems}, url = {http://portal.acm.org/citation.cfm?id=963770.963772}, volume = 22, year = 2004 }