% Two conference-paper entries on Web archiving: longitudinal analytics over
% Web archive data (CIDR 2011) and Web spam filtering for archivists (IWAW 2008).
% The interhash/intrahash fields are non-standard (presumably identifiers written
% by the exporting tool -- confirm before removing); they are kept verbatim.

@inproceedings{weikum2011longitudinal,
  abstract  = {Organizations like the Internet Archive have been capturing Web contents over decades, building up huge repositories of time-versioned pages. The timestamp annotations and the sheer volume of multi-modal content constitutes a gold mine for analysts of all sorts, across different application areas, from political analysts and marketing agencies to academic researchers and product developers. In contrast to traditional data analytics on click logs, the focus is on longitudinal studies over very long horizons. This longitudinal aspect affects and concerns all data and metadata, from the content itself, to the indices and the statistical metadata maintained for it. Moreover, advanced analysts prefer to deal with semantically rich entities like people, places, organizations, and ideally relationships such as company acquisitions, instead of, say, Web pages containing such references. For example, tracking and analyzing a politician's public appearances over a decade is much harder than mining frequently used query words or frequently clicked URLs for the last month. The huge size of Web archives adds to the complexity of this daunting task.
               This paper discusses key challenges, that we intend to take up, which are posed by this kind of longitudinal analytics: time-travel indexing and querying, entity detection and tracking along the time axis, algorithms for advanced analyses and knowledge discovery, and scalability and platform issues.},
  author    = {Weikum, Gerhard and
               Ntarmos, Nikos and
               Spaniol, Marc and
               Triantafillou, Peter and
               Benczúr, András and
               Kirkpatrick, Scott and
               Rigaux, Philippe and
               Williamson, Mark},
  booktitle = {Proceedings of the 5th Biennial Conference on Innovative Data Systems Research},
  interhash = {2d84fdbf82a84bfc557056df3d0dcf11},
  intrahash = {6ffcc0d793bbe53bf6ed17f9d929846e},
  month     = jan,
  pages     = {199--202},
  title     = {Longitudinal Analytics on {Web} Archive Data: It's About Time!},
  url       = {http://www.cidrdb.org/cidr2011/Papers/CIDR11_Paper26.pdf},
  year      = 2011
}

@inproceedings{benczur2008survey,
  abstract  = {While Web archive quality is endangered by Web spam, a side effect of the high commercial value of top-ranked search-engine results, so far Web spam filtering technologies are rarely used by Web archivists. In this paper we make the first attempt to disseminate existing methodology and envision a solution for Web archives to share knowledge and unite efforts in Web spam hunting. We survey the state of the art in Web spam filtering illustrated by the recent Web spam challenge data sets and techniques and describe the filtering solution for archives envisioned in the LiWA—Living Web Archives project.},
  address   = {Aarhus, Denmark},
  author    = {Benczúr, András A. and
               Siklósi, Dávid and
               Szabó, Jácint and
               Bíró, István and
               Fekete, Zsolt and
               Kurucz, Miklós and
               Pereszlényi, Attila and
               Rácz, Simon and
               Szabó, Adrienn},
  booktitle = {Proceedings of the 8th International Web Archiving Workshop {IWAW}'08},
  interhash = {b09d09a4d29ba2a80a5a29b9a76ed5f0},
  intrahash = {911a912a75e50451923522223f7717e8},
  month     = sep,
  title     = {{Web} Spam: a Survey with Vision for the Archivist},
  url       = {http://iwaw.europarchive.org/08/IWAW2008-Benczur.pdf},
  year      = 2008
}