% Adda et al. (1998): GRACE French part-of-speech tagging evaluation task, LREC proceedings.
% Acronyms and proper nouns are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{Adda98thegrace,
  author = {Adda, Gilles and Mariani, Joseph and Lecomte, Josette and Paroubek, Patrick and Rajman, Martin},
  title = {The {GRACE} {French} Part-of-Speech Tagging Evaluation Task},
  booktitle = {Proceedings of the First International Conference on Language Resources and Evaluation ({LREC})},
  pages = {433--441},
  year = 1998,
  interhash = {3237c9df89a7a80dfc951905e7bfac3e},
  intrahash = {8fdfe8f12ea1762129195d3d8f1e5581}
}

% Davis and Goadrich (2006): relationship between Precision-Recall and ROC curves, ICML '06.
% The doi field holds the bare DOI (no resolver prefix); styles add the resolver themselves.
@inproceedings{davis2006relationship,
  author = {Davis, Jesse and Goadrich, Mark},
  title = {The relationship between {Precision-Recall} and {ROC} curves},
  booktitle = {{ICML} '06: Proceedings of the 23rd international conference on Machine learning},
  location = {Pittsburgh, Pennsylvania},
  pages = {233--240},
  publisher = {ACM},
  address = {New York, NY, USA},
  year = 2006,
  isbn = {1-59593-383-2},
  doi = {10.1145/1143844.1143874},
  url = {http://portal.acm.org/citation.cfm?id=1143874},
  abstract = {Receiver Operator Characteristic (ROC) curves are commonly used to present results for binary decision problems in machine learning. However, when dealing with highly skewed datasets, Precision-Recall (PR) curves give a more informative picture of an algorithm's performance. We show that a deep connection exists between ROC space and PR space, such that a curve dominates in ROC space if and only if it dominates in PR space. A corollary is the notion of an achievable PR curve, which has properties much like the convex hull in ROC space; we show an efficient algorithm for computing this curve. Finally, we also note differences in the two types of curves are significant for algorithm design. For example, in PR space it is incorrect to linearly interpolate between points. Furthermore, algorithms that optimize the area under the ROC curve are not guaranteed to optimize the area under the PR curve.},
  interhash = {e4ea92aea3ff8bbb3eb04c64867505f2},
  intrahash = {4cc51d680241bab2326e28dfea42c9ea}
}

% Lewis (1991): evaluating text categorization, Speech and Natural Language Workshop proceedings.
% Page ranges use a double hyphen (en-dash); month uses the predefined lowercase macro.
@inproceedings{lewis1991evaluating,
  author = {Lewis, David D.},
  title = {Evaluating text categorization},
  booktitle = {Proceedings of Speech and Natural Language Workshop},
  pages = {312--318},
  publisher = {Morgan Kaufmann},
  address = {San Mateo},
  month = feb,
  year = 1991,
  url = {http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=FB1ECC14412DFFF631E7C0725D4DB3CC?doi=10.1.1.56.9675&rep=rep1&type=pdf},
  abstract = {While certain standard procedures are widely used for evaluating text retrieval systems and algorithms, the same is not true for text categorization. Omission of important data from reports is common and methods of measuring effectiveness vary widely. This has made judging the relative merits of techniques for text categorization difficult and has disguised important research issues. In this paper I discuss a variety of ways of evaluating the effectiveness of text categorization systems, drawing both on reported categorization experiments and on methods used in evaluating query-driven retrieval. I also consider the extent to which the same evaluation methods may be used with systems for text extraction, a more complex task. In evaluating either kind of system, the purpose for which the output is to be used is crucial in choosing appropriate evaluation methods.},
  interhash = {a9c64235f49e18a6b80c306b61ff40c2},
  intrahash = {2e8f19bde0a73d96d16c071b2016073f}
}