@techreport{group2006cases, abstract = {Archives of material published on the Internet are still in their infancy; the oldest archive (www.archive.org) is a mere 10 years old. In order to gain a better understanding of the expectations and requirements users of Internet archives are likely to have, this report identifies and specifies a number of possible usage scenarios of an Internet archive. Attempts at predicting usage of new technologies are by nature uncertain and likely to miss important applications. The list of usage scenarios presented in this report should thus be considered an opening statement in an ongoing dialogue with relevant user groups. It must always be considered that access will very much depend on the legal foundation of each web archive. This report, conducted by the Access Tools Working Group of the IIPC, relates to the Prototype report that is based on some of the usage scenarios and attempts to illustrate what interfaces may be needed by developing mock-up prototypes of the functional components suggested by the use cases.}, author = {{Access Tools Working Group}}, institution = {International Internet Preservation Consortium}, interhash = {e80a41d4c512ded4c37b4618bde86e1f}, intrahash = {55f72dcd8e1b4a526023fe861ce90f47}, month = may, title = {Use Cases for Access to Internet Archives}, url = {http://www.netpreserve.org/resources/use-cases-access-internet-archives}, year = 2006 } @inproceedings{pal2012information, abstract = {Often an interesting true value such as a stock price, sports score, or current temperature is only available via the observations of noisy and potentially conflicting sources. Several techniques have been proposed to reconcile these conflicts by computing a weighted consensus based on source reliabilities, but these techniques focus on static values. When the real-world entity evolves over time, the noisy sources can delay, or even miss, reporting some of the real-world updates. This temporal aspect introduces two key challenges for consensus-based approaches: (i) due to delays, the mapping between a source's noisy observation and the real-world update it observes is unknown, and (ii) missed updates may translate to missing values for the consensus problem, even if the mapping is known. To overcome these challenges, we propose a formal approach that models the history of updates of the real-world entity as a hidden semi-Markovian process (HSMM). The noisy sources are modeled as observations of the hidden state, but the mapping between a hidden state (i.e. real-world update) and the observation (i.e. source value) is unknown. We propose algorithms based on Gibbs Sampling and EM to jointly infer both the history of real-world updates as well as the unknown mapping between them and the source values. 
We demonstrate using experiments on real-world datasets how our history-based techniques improve upon history-agnostic consensus-based approaches.}, acmid = {2187943}, address = {New York, NY, USA}, author = {Pal, Aditya and Rastogi, Vibhor and Machanavajjhala, Ashwin and Bohannon, Philip}, booktitle = {Proceedings of the 21st international conference on World Wide Web}, doi = {10.1145/2187836.2187943}, interhash = {0dc613c467921aff444e1e33a9aeace4}, intrahash = {3d334b8a497f199c27d56629e000f3cc}, isbn = {978-1-4503-1229-5}, location = {Lyon, France}, numpages = {10}, pages = {789--798}, publisher = {ACM}, title = {Information integration over time in unreliable and uncertain environments}, url = {http://doi.acm.org/10.1145/2187836.2187943}, year = 2012 } @inproceedings{li2012linking, abstract = {In the real world, entities change dynamically, and the changes are captured in two dimensions: time and space. For data sets that contain temporal records, where each record is associated with a time stamp and describes some aspects of a real-world entity at that particular time, we often wish to identify records that describe the same entity over time so as to enable interesting longitudinal data analysis. For data sets that contain geographically referenced data describing real-world entities at different locations (i.e., location entities), we wish to link those entities that belong to the same organization or network. However, existing record linkage techniques ignore additional evidence in temporal and spatial data and can fall short for these cases.

This proposal studies linking temporal and spatial records. For temporal record linkage, we apply time decay to capture the effect of elapsed time on entity value evolution, and propose clustering methods that consider the time order of records in clustering. For linking location records, we distinguish between strong and weak evidence; for the former, we study core generation in the presence of erroneous data, and then leverage the discovered strong evidence to make the remaining decisions.}, acmid = {2213612}, address = {New York, NY, USA}, author = {Li, Pei}, booktitle = {Proceedings of the SIGMOD/PODS 2012 PhD Symposium}, doi = {10.1145/2213598.2213612}, interhash = {91dc73d16f9ebbbaec416db6333aa2f5}, intrahash = {18b8c7bcd4398f5502225bca23158f6e}, isbn = {978-1-4503-1326-1}, location = {Scottsdale, Arizona, USA}, numpages = {6}, pages = {51--56}, publisher = {ACM}, title = {Linking records in dynamic world}, url = {http://doi.acm.org/10.1145/2213598.2213612}, year = 2012 } @article{li2011linking, abstract = {Many data sets contain temporal records over a long period of time; each record is associated with a time stamp and describes some aspects of a real-world entity at that particular time (e.g., author information in DBLP). In such cases, we often wish to identify records that describe the same entity over time so as to enable interesting longitudinal data analysis. However, existing record linkage techniques ignore the temporal information and can fall short for temporal data. This paper studies linking temporal records. First, we apply time decay to capture the effect of elapsed time on entity value evolution. Second, instead of comparing each pair of records locally, we propose clustering methods that consider the time order of the records and make global decisions. Experimental results show that our algorithms significantly outperform traditional linkage methods on various temporal data sets.}, author = {Li, P. and Luna Dong, X. and Maurino, A. and Srivastava, D.}, interhash = {0d8151346fd512743809aa0cfe591955}, intrahash = {85be46ab943802120277be8f8b6b264b}, issn = {2150-8097}, journal = {Proceedings of the VLDB Endowment}, month = aug, number = 11, pages = {956--967}, title = {Linking Temporal Records}, url = {http://hdl.handle.net/10281/28587}, volume = 4, year = 2011 } @techreport{gomes2012creating, abstract = {The web became a mass means of publication that has been replacing printed media. However, its information is extremely ephemeral. Currently, most of the information available on the web is less than 1 year old. There are several initiatives worldwide that struggle to archive information from the web before it vanishes. However, search mechanisms to access this information are still limited and do not satisfy their users, who demand performance similar to live-web search engines. This paper presents some of the work developed to create an efficient and effective searchable web archive service, from data acquisition to user interface design. The results of this research were applied in practice to create the Portuguese Web Archive, which has been publicly available since January 2010. It supports full-text search over 1 billion contents archived from 1996 to 2010. 
The developed software is available as an open source project.}, address = {Portugal}, author = {Gomes, Daniel and Cruz, David and Miranda, João and Costa, Miguel and Fontes, Simão}, institution = {Foundation for National Scientific Computing}, interhash = {b5c01e5cadcc1d8ef44d48b2022144d2}, intrahash = {da5b8a339b2c3d765c3b0a7bd025af82}, month = may, title = {Creating a searchable web archive}, url = {http://web.ist.utl.pt/joaocarvalhomiranda/docs/other/creating-a-searchable-web-archive-relatorio.pdf}, year = 2012 } @inproceedings{weikum2011longitudinal, abstract = {Organizations like the Internet Archive have been capturing Web contents over decades, building up huge repositories of time-versioned pages. The timestamp annotations and the sheer volume of multi-modal content constitute a gold mine for analysts of all sorts, across different application areas, from political analysts and marketing agencies to academic researchers and product developers. In contrast to traditional data analytics on click logs, the focus is on longitudinal studies over very long horizons. This longitudinal aspect affects and concerns all data and metadata, from the content itself to the indices and the statistical metadata maintained for it. Moreover, advanced analysts prefer to deal with semantically rich entities like people, places, organizations, and ideally relationships such as company acquisitions, instead of, say, Web pages containing such references. For example, tracking and analyzing a politician's public appearances over a decade is much harder than mining frequently used query words or frequently clicked URLs for the last month. The huge size of Web archives adds to the complexity of this daunting task. This paper discusses the key challenges posed by this kind of longitudinal analytics, which we intend to take up: time-travel indexing and querying, entity detection and tracking along the time axis, algorithms for advanced analyses and knowledge discovery, and scalability and platform issues.}, author = {Weikum, Gerhard and Ntarmos, Nikos and Spaniol, Marc and Triantafillou, Peter and Benczúr, András and Kirkpatrick, Scott and Rigaux, Philippe and Williamson, Mark}, booktitle = {Proceedings of the 5th Biennial Conference on Innovative Data Systems Research}, interhash = {2d84fdbf82a84bfc557056df3d0dcf11}, intrahash = {6ffcc0d793bbe53bf6ed17f9d929846e}, month = jan, pages = {199--202}, title = {Longitudinal Analytics on Web Archive Data: It's About Time!}, url = {http://www.cidrdb.org/cidr2011/Papers/CIDR11_Paper26.pdf}, year = 2011 } @article{rauber2009webarchivierung, abstract = {In recent years, libraries and archives have increasingly taken on the task of collecting content from the World Wide Web alongside conventional publications, in order to preserve this valuable part of our cultural heritage and to keep important information available over the long term. These massive data collections offer fascinating possibilities for quickly accessing important information that has already been lost from the live web. They are an indispensable source for researchers who, in the future, will want to trace the societal and technological development of our time. On the other hand, such a collection constitutes an entirely new kind of data set that raises not only legal but also numerous ethical questions regarding its use. 
These questions will grow to the extent that the technical means for automatically analyzing and interpreting these data become more powerful. Since most web archiving initiatives are aware of this problem, use of the data currently remains heavily restricted in most cases, or some form of "opt-out" mechanism is provided through which website owners can exclude their pages from a web archive. With either approach, web archives cannot realize their full potential. This article begins with a brief description of the technologies used to collect web content for archiving purposes. It questions assumptions regarding the free availability of the data and different kinds of use. Building on this, it identifies a number of open questions whose resolution could allow broader access to and better use of web archives.}, author = {Rauber, Andreas and Kaiser, Max}, editor = {Knoll, Matthias and Meier, Andreas}, interhash = {3b35b676a2817868d93481aeebfa4154}, intrahash = {cdaef18169a7d8300cf54daf018a74cc}, issn = {1436-3011}, journal = {HMD Praxis der Wirtschaftsinformatik}, month = aug, publisher = {dpunkt.verlag}, title = {Webarchivierung und Web Archive Mining: Notwendigkeit, Probleme und Lösungsansätze}, url = {http://hmd.dpunkt.de/268/03.php}, volume = 268, year = 2009 } @article{cho2006stanford, abstract = {We describe the design and performance of WebBase, a tool for Web research. The system includes a highly customizable crawler, a repository for collected Web pages, an indexer for both text and link-related page features, and a high-speed content distribution facility. The distribution module enables researchers world-wide to retrieve pages from WebBase, and stream them across the Internet at high speed. The advantage for the researchers is that they need not all crawl the Web before beginning their research. WebBase has been used by scores of research and teaching organizations world-wide, mostly for investigations into Web topology and linguistic content analysis. After describing the system's architecture, we explain our engineering decisions for each of the WebBase components, and present respective performance measurements.}, acmid = {1149124}, address = {New York, NY, USA}, author = {Cho, Junghoo and Garcia-Molina, Hector and Haveliwala, Taher and Lam, Wang and Paepcke, Andreas and Raghavan, Sriram and Wesley, Gary}, doi = {10.1145/1149121.1149124}, interhash = {bebbc072ea2dccf4c2b27abf244c1f08}, intrahash = {3cd21bf8a87619e0489b8da177c9f0b4}, issn = {1533-5399}, issue_date = {May 2006}, journal = {ACM Transactions on Internet Technology}, month = may, number = 2, numpages = {34}, pages = {153--186}, publisher = {ACM}, title = {Stanford WebBase components and applications}, url = {http://doi.acm.org/10.1145/1149121.1149124}, volume = 6, year = 2006 } @article{costa2005distributed, abstract = {Sidra is a new indexing and ranking system for large-scale Web collections. Sidra creates multiple distributed indexes, organized and partitioned by different ranking criteria, aimed at supporting contextualized queries over hypertexts and their metadata. This paper presents the architecture of Sidra and the algorithms used to create its indexes. Performance measurements on the Portuguese Web data show that Sidra's indexing times and scalability are comparable to those of global Web search engines.}, author = {Costa, M. 
and Silva, M.}, doi = {10.1109/TLA.2005.1468656}, interhash = {aa19ebf217dc3ff7b18d4b4f18ef0f21}, intrahash = {c927cddd9fc8fb672198081f868e4efb}, issn = {1548-0992}, journal = {IEEE Latin America Transactions}, month = mar, number = 1, pages = {2--8}, title = {Distributed Indexing of Large-Scale Web Collections}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1468656}, volume = 3, year = 2005 } @article{toyoda2012history, abstract = {This paper describes the history and the current challenges of archiving massive and extremely diverse amounts of user-generated data in an international environment on the World Wide Web, and the technologies required for interoperability between service providers and for preserving their contents in the future.}, author = {Toyoda, M. and Kitsuregawa, M.}, doi = {10.1109/JPROC.2012.2189920}, interhash = {5f4e000394b30536cc056b2301e12652}, intrahash = {92a49a70a925a09e86ec6a6d67d8b2ae}, issn = {0018-9219}, journal = {Proceedings of the IEEE}, month = may, number = {Special Centennial Issue}, pages = {1441--1443}, title = {The History of Web Archiving}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6182575&tag=1}, volume = 100, year = 2012 } @inproceedings{chung2009study, abstract = {In this paper, we study the overall link-based spam structure and its evolution, which would be helpful for the development of robust analysis tools and for research on Web spamming as a social activity in cyberspace. First, we use strongly connected component (SCC) decomposition to separate many link farms from the largest SCC, the so-called core. We show that denser link farms in the core can be extracted by node filtering and recursive application of SCC decomposition to the core. Surprisingly, we can find new large link farms during each iteration, and this trend continues for at least 10 iterations. In addition, we measure the spamicity of such link farms. Next, the evolution of link farms is examined over two years. Results show that almost all large link farms no longer grow, while some of them shrink, and that many large link farms are created within one year.}, acmid = {1531917}, address = {New York, NY, USA}, author = {Chung, Young-joo and Toyoda, Masashi and Kitsuregawa, Masaru}, booktitle = {Proceedings of the 5th International Workshop on Adversarial Information Retrieval on the Web}, doi = {10.1145/1531914.1531917}, interhash = {1a0eff19d17ebb60bcbc4f7c6ccc460a}, intrahash = {d36b62eab48a830b4ac9da825a4929a5}, isbn = {978-1-60558-438-6}, location = {Madrid, Spain}, numpages = {8}, pages = {9--16}, publisher = {ACM}, title = {A study of link farm distribution and evolution using a time series of web snapshots}, url = {http://doi.acm.org/10.1145/1531914.1531917}, year = 2009 } @incollection{kitsuregawa2008sociosense, abstract = {We introduce the Socio-Sense Web analysis system. The system applies structural and temporal analysis methods to a long-term Web archive to obtain insight into the real society. 
We present an overview of the system and core methods, followed by excerpts from case studies on consumer behavior analyses.}, address = {Berlin/Heidelberg}, affiliation = {The University of Tokyo Institute of Industrial Science 4–6–1 Komaba Meguro-ku Tokyo 153-8505 Japan}, author = {Kitsuregawa, Masaru and Tamura, Takayuki and Toyoda, Masashi and Kaji, Nobuhiro}, booktitle = {Progress in WWW Research and Development}, doi = {10.1007/978-3-540-78849-2_1}, editor = {Zhang, Yanchun and Yu, Ge and Bertino, Elisa and Xu, Guandong}, interhash = {a0ac63893d45095766b0f5fc8fd78139}, intrahash = {76f35c47a4b65f229269ea9ea39829d8}, isbn = {978-3-540-78848-5}, keyword = {Computer Science}, pages = {1--8}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Socio-Sense: A System for Analysing the Societal Behavior from Long Term Web Archive}, url = {http://dx.doi.org/10.1007/978-3-540-78849-2_1}, volume = 4976, year = 2008 } @incollection{lyman2000archiving, address = {Washington, D.C.}, author = {Lyman, Peter}, booktitle = {Building a National Strategy for Digital Preservation: Issues in Digital Media Archiving}, interhash = {023b41bc68b3d536d084be78fa877f4c}, intrahash = {9028c7602661051270e12efe9ae1c362}, isbn = {1-887334-91-2}, month = apr, pages = {38--51}, publisher = {Council on Library and Information Resources and Library of Congress}, title = {Archiving the World Wide Web}, url = {http://www.clir.org/pubs/reports/pub106/web.html}, year = 2002 } @article{stirling2012archives, abstract = {The Internet has been covered by legal deposit legislation in France since 2006, making web archiving one of the missions of the Bibliothèque nationale de France (BnF). Access to the web archives has been provided in the library on an experimental basis since 2008. In the context of increasing interest in many countries in web archiving and how it may best serve the needs of researchers, especially in the expanding field of Internet studies for social sciences, a qualitative study was performed, based on interviews with potential users of the web archives held at the BnF, and particularly researchers working in various areas related to the Internet. The study aimed to explore their needs in terms of both content and services, and also to analyse different ways of representing the archives, in order to identify ways of increasing their use. While the interest of maintaining the "memory" of the web is obvious to the researchers, they are faced with the difficulty of defining, in what is a seemingly limitless space, meaningful collections of documents. 
Cultural heritage institutions such as national libraries are perceived as trusted third parties capable of creating rationally-constructed and well-documented collections, but such archives raise certain ethical and methodological questions.}, author = {Stirling, Peter and Chevallier, Philippe and Illien, Gildas}, doi = {10.1045/march2012-stirling}, interhash = {a783191c99a285197525595ebf509bb2}, intrahash = {4f7840193e7e435ad5dd0003fc93691a}, issn = {1082-9873}, journal = {D-Lib Magazine}, month = {March/April}, number = {3/4}, title = {Web Archives for Researchers: Representations, Expectations and Potential Uses}, url = {http://www.dlib.org/dlib/march12/stirling/03stirling.html}, volume = 18, year = 2012 } @inproceedings{benczur2008survey, abstract = {While Web archive quality is endangered by Web spam, a side effect of the high commercial value of top-ranked search-engine results, so far Web spam filtering technologies are rarely used by Web archivists. In this paper we make the first attempt to disseminate existing methodology and envision a solution for Web archives to share knowledge and unite efforts in Web spam hunting. We survey the state of the art in Web spam filtering, illustrated by the recent Web spam challenge data sets and techniques, and describe the filtering solution for archives envisioned in the LiWA (Living Web Archives) project.}, address = {Aarhus, Denmark}, author = {Benczúr, András A. and Siklósi, Dávid and Szabó, Jácint and Bíró, István and Fekete, Zsolt and Kurucz, Miklós and Pereszlényi, Attila and Rácz, Simon and Szabó, Adrienn}, booktitle = {Proceedings of the 8th International Web Archiving Workshop IWAW'08}, interhash = {b09d09a4d29ba2a80a5a29b9a76ed5f0}, intrahash = {911a912a75e50451923522223f7717e8}, month = sep, title = {Web Spam: a Survey with Vision for the Archivist}, url = {http://iwaw.europarchive.org/08/IWAW2008-Benczur.pdf}, year = 2008 } @inproceedings{sigurdsson2005incremental, abstract = {The Heritrix web crawler aims to be the world's first open source, extensible, web-scale, archival-quality web crawler. It has, however, been limited in its crawling strategies to snapshot crawling. This paper reports on work to add the ability to conduct incremental crawls to its capabilities. We first discuss the concept of incremental crawling as opposed to snapshot crawling, and then the possible ways to design an effective incremental strategy. An overview is given of our implementation, and its limits and strengths are discussed. We then report on the results of initial experimentation with the new software, which has gone well. Finally, we discuss issues that remain unresolved and possible future improvements.}, address = {Vienna, Austria}, author = {Sigurðsson, Kristinn}, booktitle = {Proceedings of the 5th International Web Archiving Workshop (IWAW’05)}, interhash = {d84cf76a7001d472bd27dd092c0e1357}, intrahash = {1065880693b176515b5001af844e251f}, title = {Incremental crawling with Heritrix}, url = {http://iwaw.europarchive.org/05/papers/iwaw05-sigurdsson.pdf}, year = 2005 } @inproceedings{mohr2004introduction, abstract = {Heritrix is the Internet Archive's open-source, extensible, web-scale, archival-quality web crawler project. The Internet Archive started Heritrix development in the early part of 2003. The intention was to develop a crawler for the specific purpose of archiving websites and to support multiple different use cases, including focused and broad crawling. 
The software is open source to encourage collaboration and joint development across institutions with similar needs. A pluggable, extensible architecture facilitates customization and outside contribution. Now, after over a year of development, the Internet Archive and other institutions are using Heritrix to perform focused and increasingly broad crawls.}, address = {Bath, UK}, author = {Mohr, G. and Kimpton, M. and Stack, M. and Ranitovic, I.}, booktitle = {Proceedings of the 4th International Web Archiving Workshop IWAW'04}, interhash = {4d9fda8f3428384167ee23949442643d}, intrahash = {09d70d4ea1810fe89522755a0982169f}, month = jul, title = {Introduction to heritrix, an archival quality web crawler}, url = {http://crawler.archive.org/Mohr-et-al-2004.pdf}, year = 2004 } @article{burner1997crawling, abstract = { Crawling towards Eternity Building An Archive of The World Wide Web By Mike Burner When you ask what he's up to, Brewster Kahle likes to surprise you. So when designing supercomputers became cliche — and everybody was developing new technologies for full-text indexing — Brewster decided to archive the Internet. He thought the Internet would fit nicely into a box. The box he had in mind was a tape robot that, if it held enough tapes, could hold terabytes of data. Into this box, Brewster wanted to cram all of the publicly accessible data from the World Wide Web, anonymous FTP sites, USENET news, and public gopher sites. His vision was that this would become an archive of digital history, available forever as a research tool and time capsule. By visiting this "library," the world would be able to trace the development of technologies, styles, and cultural trends. Brewster knew it would be necessary to transcribe the archive on a continuing basis, lest the media deteriorate or the ability to read it disappear. Thus was born the Internet Archive. Housed in the Little House on the Presidio in San Francisco, the Archive is actively collecting all the Web data that can be pulled down two T1 lines. This article describes Brewster and his archivists' experiences, and what they have learned from them. Designing the Archive Crawler Collecting Web data isn't a black art. You just need a program that can "speak" HTTP and parse the HTML to find links (in the form of URLs) to additional network objects. Such a "crawler" program can start almost anywhere on the Web, and will eventually find almost every public page in existence. The catch is that to accomplish anything useful you have to keep track of all the retrieved objects. To archive the data, you must put it somewhere, and to do this efficiently you must crawl across many sites at once. If you have a conscience, this must be done without upsetting the millions of Web authors and administrators who have created and are supporting the data. So, when the Archive set out to design its crawler, three things were accepted as givens: The proposed Standard for Robot Exclusion (SRE), which provides mechanisms for Web-site owners to control robot behavior on their sites, must be scrupulously obeyed. The design must permit splitting the crawl across several machines, possibly at different geographic locations. The objects retrieved must be "bundled" into large files, since the tape robot's filesystem cannot manage hundreds of millions of small objects. Beyond these constraints, the Archive had several concerns: The crawler should be polite, so as not to unduly burden a Web site when it visits. 
The crawling software must use the hardware resources efficiently, to optimize available bandwidth. The storage strategy should support the anticipated retrieval needs. Driven by these constraints and concerns, a common theme emerged in design discussions: how "expensive" it was to hop from site to site. If, for example, there were a huge list of unsorted URLs, and the crawler just grabbed one as needed, it would have to parse the URL to find the host and port, look up the IP address of the host, and fetch the robot exclusions for the site, all before requesting the object that the URL referenced. It would be difficult to prevent the same site being visited simultaneously by multiple crawlers and to retrieve the large number of "bundles" containing all of the objects for a given site. The Archive concluded that it would be easiest to crawl on a site-by-site basis. Thus, the following design emerged: The list of sites would be built by collecting "external" references from Web pages (those references pointing off-site). A single process would be assigned a queue of sites, and would crawl a number of them (currently 64) at once. Each process would loop through sites using nonblocking I/O in a "select loop" (even though the crawler actually uses the Solaris "poll" system call). The crawler would let sites "rest" between object retrievals, so as not to overload any one site. The advantages of this approach were manifold, and serendipitous ones emerged as the crawler developed. With this design, IP addresses and robot exclusions could be held in memory for each site being crawled, multiple crawling machines would never collide on the same site, and the data for a single site would all be contained in a small number of bundles (often just one). How it Works A typical Archive-crawler visit to a Web site begins when the crawling process fetches a site name and IP address, and the port number from the site queue; see Figure 1. The crawler then opens (or creates) the "crawl queue" for the site, which keeps track of the URL paths on the site that have been, or need to be, retrieved. If the queue does not already exist, the path referring to the root of the site ("/") is put in the queue. The next step is to retrieve and parse robots.txt, which may contain exclusions indicating paths the crawler should not pursue. Those exclusions are stored in memory for later use. One by one, the crawler fetches references to objects from the crawl queue. The path is checked against any exclusions for the site — what's not excluded is fetched. If the object has a content type of "text/html," the crawler parses the contents of the object looking for references to other documents. When one is found, the path is normalized. Relative paths are made absolute, and hostnames are made all lower case. If a reference such as were found in the document /chevrolet/classics/chevette.html, the path of the new URL would be normalized to /chevrolet/images/chevette.gif. If the reference is to an object on the same site, the crawler checks to see if it is new. If the reference is new it is added to the crawl queue, which is updated on disk so that the crawl can resume where it left off if it is interrupted. Fetched objects are written into the large "bundle" files, preceded by a line describing the object that includes the URL, the object size, and the date it was retrieved. 
You search for an object in a bundle file by reading the description line, then the object (because the size is in the description), then the next description, and so on until the desired object is found. This is tedious, however; an index is obviously necessary. Indexing The Archive indexes Web content differently from search engines such as AltaVista. Those systems create keyword or full-text indexes of the Web's textual content to permit queries such as find me all Web pages with the phrase "little red corvette" in them. The Archive is building more of a card catalog, to support queries such as: Tell me the modification times and locations in the Archive of all objects with the URL http://www.automobile.org/features/safetybelts.html. For the search services, indexing strategies are core business technologies that directly affect how quickly they keep up-to-date with the Web. For the Archive, indexing is merely a major headache. Assuming you have twenty gigabytes of disk space, it is pretty easy to create a table of 100 million URLs and another of 200 million object retrievals. Unfortunately, it takes a long time to find anything in such huge tables, unless you can afford a machine with 8 GB of memory to hold the table indexes in RAM. So the Archive has split the data into about a thousand smaller tables, numbered with a hashed value of the server name. While this has made queries tolerable, in the long run that 8GB machine will probably be needed. The URL Problem Once the Archive had a crawler design and a cataloging strategy, "all" that was left was programming. If you polled the few dozen engineers who have actually implemented an ambitious Web-crawling effort, they'd probably cite dealing with the domain-name service and answering the question "Do I know about this URL?" as the most annoying stumbling blocks. There are at least 100 million unique URLs on the Web. They include HTML documents, images, sound files, movies, applications, and a host of less common file types. As the crawler locates URLs in HTML documents, it must decide whether it has already retrieved the referenced object, or whether this is a new link. The problem is not even as "simple" as comparing it to the 100 million known links, since the same machine may have multiple names. The URLs http://www.automobile.org/vw/bug.html and http://home.automobile.org/vw/bug.html may be two names for the same object. The straightforward approach would be to load up a relational database with all URLs — including all aliases — and just query against it when a URL is located in an HTML document. But when fetching ten pages per second, with an average of 18 links per page (including href, src, background, url, code, and lowsrc), standard database engines just could not keep up. The Archive's solution is a giant bitmap, renowned as the Swiss army knife of high-performance programming; see Figure 2. The trick is to allocate a chunk of memory, zero it, and set bits to keep track of what you have already seen. To choose the bits, the crawler computes ten hash values of the URL string, using ten different hashing algorithms (well, actually, it uses one algorithm and ten different tables of ASCII-integer mappings). So when an URL is located, its hash array is computed, the appropriate bits are checked in the bitmap, and if all ten are already set, the URL is deemed "already found" (though not necessarily "already visited"). If any of the bits are not set, the URL is new; it is added to the queue and its bits are set in the bitmap. 
This approach is very fast, but may render "false positives" (identifying an unseen URL as already seen). Statistically, a bitmap with two bytes for each string being tracked is pretty safe. One has to wonder, though, if "index.html" hashes to the same values as some other common filename. As fast as the giant bitmap is, it is still difficult to work with the whole Web. Two bytes times 100 million is a lot of memory. Moreover, it is nearly impossible for multiple crawling machines to synchronize their bitmaps in real time. This is one of the problems serendipitously solved by the Archive's decision to crawl by site. Because most links in an HTML document are "local" to the site, most of the "Have we seen it?" questions can be answered by checking a bitmap specific to the site being crawled. This bitmap can be much smaller (usually 256K bits), and need not be synchronized with other bitmaps. External references are simply written to disk and batch-processed by a separate program. That program is the only one that needs to wield a 2GB bitmap, and that bitmap can be saved to disk when not in use. DNS The domain name service (DNS) is a clever, distributed database that allows the machines on the Internet to recognize each other in the absence of a central naming authority. A few top-level servers can tell you what machine to contact for information about machines in a given domain. A "domain" designates an organization, such as harvard.edu or microsoft.com; there are currently more than 800,000 registered domains in the United States, and many more abroad. For a domain to work properly, it must have a "nameserver," a machine that knows the names and addresses of all of the machines in the domain. So to get to cus.cam.ac.uk, a Web browser would first contact one of the top-level servers, which would redirect it to a server in the United Kingdom, which in turn would likely redirect it to a machine at Cambridge University that would provide the IP address for cus.cam.ac.uk. This name-resolution scheme works fine for casual browsing, which only requires a name lookup every few minutes, but it becomes a serious bottleneck for high-speed, widely distributed data collection. Another problem with DNS is the issue of uniqueness. It is common for a single machine to have several names; this creates intuitively named aliases for machines that provide common services (such as "www," "mail," or "news") and is a simple means for creating "virtual domains." Clearly, it would be undesirable to crawl the same machine multiple times, just because it had multiple names within DNS. Worse yet, multiple instances of the crawler might simultaneously access the same machine, causing undue load on the site. To avoid crawling the same data multiple times, the crawler must treat all names that resolve to the same IP address as the same host. So the IP address of a newly resolved hostname must be compared to all other IP addresses in the crawler's site queue to determine if the address is new. Unlike most crawling groups, the Archive has a third problem with DNS: tracking its history. The global namespace is in constant flux. The name www.automobile.org may point to model-t.automobile.org today, hudson.automobile.org next month, and ferrari.automobile.org next year. The Archive needs to keep track of these mappings, so that when a user asks to see www.automobile.org from December of 1996, the Archive will know that means the data fetched from model-t. Therefore, the Archive created a database to map DNS over time. 
Each record has a value, a type, an entry date, a superseded date, and an ID number. The value may hold either a name or an address and the type may be canonical, address, or alias. The "canonical" name of the machine is what the Archive actually calls it; it is always a canonical name within DNS, but is arbitrarily chosen if a given machine has multiple canonical DNS names. An address is the IP address that the Archive uses; if a machine has multiple IP addresses, the "extras" will be stored as aliases. An alias is a DNS alias ("www" is an alias about half of the time), an extra canonical name, or an extra address. The entry date is when the information was first looked up. The superseded date is when it became invalid; a current entry has an empty (NULL) superseded date. All records referring to the same canonical record share an ID number. Figure 3 shows how to retrieve the current address for www.automobile.org in SQL. In English, that would be: get me the address that hasn't expired for the host that is currently called www.automobile.org The DNS map is maintained separately from the crawl, so the crawling software never has to wait for name resolution to retrieve an object. Occasionally, the name-address pairs will become obsolete, but this is rare enough that the cost of revisiting is far lower than the cost of resolving the name more frequently. Conclusion Despite the trials of implementation, the tribulations of adequate network connectivity, and the travails of maintaining tape robots driven constantly at capacity, the Archive is being built. The crawler has essentially been running nonstop since October 1996, and as of the end of February 1997, the Archive has about 2.5 terabytes of data under management. What will become of all of the data the Archive is collecting? The answer will depend on how copyright laws are interpreted as they relate to digital media. Perhaps it will be a decade or more before the time capsule can be opened for all to see, or perhaps an acceptable means of electronically "permitting" the publication will emerge much sooner. Those who favor the latter see the Archive offering two essential services: a citation service for the Web and a solution to "link rot." Who has not been referred to a page by a publication, only to find that the content has changed? And most of us do not go a week without encountering a "404" (File Not Found) when following a link from our favorite search engine, because the referenced page is no longer there. What if Web Techniques could write "for an early example of the use of frames, see archive:19951012/www.webtechniques .com/frames.html," and be confident that its readers, even years hence, would see what they were meant to see? Or, what if you could actually find that page at http://www.irs.ustreas.gov that explained the 1995 self-employment tax rules, even though you were being audited in 1998? Most people believe the Archive is a Good Thing, but it is a monumental task, and they are happy to leave it to Brewster Kahle and his cohorts at the Internet Archive. (Also see "The Truth About the Web".) Mike is a systems architect at the Internet Archive and wrote the Archive's Web-crawling software. Contact him at mike@archive.org. 
}, author = {Burner, Mike}, interhash = {af763eaab27b3d464b6cfa04e0dc9ade}, intrahash = {583c11530a2aa109f0cc274ae0982675}, journal = {Web Techniques Magazine}, number = 5, title = {Crawling towards Eternity: Building An Archive of The World Wide Web}, url = {http://web.archive.org/web/20080101070319/http://www.webtechniques.com/archives/1997/05/burner/}, volume = 2, year = 1997 } @inproceedings{bensaad2011archiving, abstract = {A pattern is a model or template used to summarize and describe the behavior (or trend) of data that generally contains some recurrent events. Patterns have received considerable attention in recent years and have been widely studied in the data mining field. Various pattern mining approaches have been proposed and used for different applications such as network monitoring, moving object tracking, financial or medical data analysis, scientific data processing, etc. In these different contexts, discovered patterns were useful to detect anomalies, to predict data behavior (or trend), or, more generally, to simplify data processing or to improve system performance. However, to the best of our knowledge, patterns have never been used in the context of web archiving. Web archiving is the process of continuously collecting and preserving portions of the World Wide Web for future generations. In this paper, we show how patterns of page changes can be useful tools to efficiently archive web sites. We first define our pattern model that describes the changes of pages. Then, we present the strategy used to (i) extract the temporal evolution of page changes, (ii) discover patterns, and (iii) exploit them to improve web archives. We choose the archive of French public TV channels « France Télévisions » as a case study in order to validate our approach. Our experimental evaluation based on real web pages shows the utility of patterns to improve archive quality and to optimize indexing and storage.}, acmid = {1998098}, address = {New York, NY, USA}, author = {Ben Saad, Myriam and Gançarski, Stéphane}, booktitle = {Proceedings of the 11th annual international ACM/IEEE joint conference on Digital libraries}, doi = {10.1145/1998076.1998098}, interhash = {88a952fa20259b4d32e583b523eea979}, intrahash = {07edd88128d243297d23786c54c78dce}, isbn = {978-1-4503-0744-4}, location = {Ottawa, Ontario, Canada}, numpages = {10}, pages = {113--122}, publisher = {ACM}, title = {Archiving the web using page changes patterns: a case study}, url = {http://doi.acm.org/10.1145/1998076.1998098}, year = 2011 }