Garbin, E. & Mani, I.
(2005):
Disambiguating toponyms in news.
In: Proceedings of the conference on Human Language Technology and Empirical Methods in Natural Language Processing,
Stroudsburg, PA, USA.
[Volltext]
[Kurzfassung] [BibTeX][Endnote]
This research is aimed at the problem of disambiguating toponyms (place names) in terms of a classification derived by merging information from two publicly available gazetteers. To establish the difficulty of the problem, we measured the degree of ambiguity, with respect to a gazetteer, for toponyms in news. We found that 67.82% of the toponyms found in a corpus that were ambiguous in a gazetteer lacked a local discriminator in the text. Given the scarcity of human-annotated data, our method used unsupervised machine learning to develop disambiguation rules. Toponyms were automatically tagged with information about them found in a gazetteer. A toponym that was ambiguous in the gazetteer was automatically disambiguated based on preference heuristics. This automatically tagged data was used to train a machine learner, which disambiguated toponyms in a human-annotated news corpus at 78.5% accuracy.
@comment{Conference paper record. Percent signs in the abstract are written as \% so the field stays valid LaTeX if a style or tool typesets it.}
@inproceedings{garbin2005disambiguating,
  author    = {Garbin, Eric and Mani, Inderjeet},
  title     = {Disambiguating toponyms in news},
  booktitle = {Proceedings of the conference on Human Language Technology and Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA, USA},
  year      = {2005},
  pages     = {363--370},
  url       = {http://dx.doi.org/10.3115/1220575.1220621},
  doi       = {10.3115/1220575.1220621},
  keywords  = {geo, toponym, news, disambiguation, map, extraction},
  abstract  = {This research is aimed at the problem of disambiguating toponyms (place names) in terms of a classification derived by merging information from two publicly available gazetteers. To establish the difficulty of the problem, we measured the degree of ambiguity, with respect to a gazetteer, for toponyms in news. We found that 67.82\% of the toponyms found in a corpus that were ambiguous in a gazetteer lacked a local discriminator in the text. Given the scarcity of human-annotated data, our method used unsupervised machine learning to develop disambiguation rules. Toponyms were automatically tagged with information about them found in a gazetteer. A toponym that was ambiguous in the gazetteer was automatically disambiguated based on preference heuristics. This automatically tagged data was used to train a machine learner, which disambiguated toponyms in a human-annotated news corpus at 78.5\% accuracy.}
}
%0 = inproceedings
%A = Garbin, Eric and Mani, Inderjeet
%B = Proceedings of the conference on Human Language Technology and Empirical Methods in Natural Language Processing
%C = Stroudsburg, PA, USA
%D = 2005
%I = Association for Computational Linguistics
%T = Disambiguating toponyms in news
%U = http://dx.doi.org/10.3115/1220575.1220621