@article{cho2006stanford, abstract = {We describe the design and performance of WebBase, a tool for Web research. The system includes a highly customizable crawler, a repository for collected Web pages, an indexer for both text and link-related page features, and a high-speed content distribution facility. The distribution module enables researchers world-wide to retrieve pages from WebBase, and stream them across the Internet at high speed. The advantage for the researchers is that they need not all crawl the Web before beginning their research. WebBase has been used by scores of research and teaching organizations world-wide, mostly for investigations into Web topology and linguistic content analysis. After describing the system's architecture, we explain our engineering decisions for each of the WebBase components, and present respective performance measurements.}, acmid = {1149124}, address = {New York, NY, USA}, author = {Cho, Junghoo and Garcia-Molina, Hector and Haveliwala, Taher and Lam, Wang and Paepcke, Andreas and Raghavan, Sriram and Wesley, Gary}, doi = {10.1145/1149121.1149124}, interhash = {bebbc072ea2dccf4c2b27abf244c1f08}, intrahash = {3cd21bf8a87619e0489b8da177c9f0b4}, issn = {1533-5399}, issue_date = {May 2006}, journal = {ACM Transactions on Internet Technology}, month = may, number = 2, numpages = {34}, pages = {153--186}, publisher = {ACM}, title = {Stanford WebBase components and applications}, url = {http://doi.acm.org/10.1145/1149121.1149124}, volume = 6, year = 2006 } @inproceedings{sigurdsson2005incremental, abstract = {The Heritrix web crawler aims to be the world's first open source, extensible, web-scale, archival-quality web crawler. It has however been limited in its crawling strategies to snapshot crawling. This paper reports on work to add the ability to conduct incremental crawls to its capabilities. We first discuss the concept of incremental crawling as opposed to snapshot crawling and then the possible ways to design an effective incremental strategy. An overview is given of the implementation that we did, its limits and strengths are discussed. We then report on the results of initial experimentation with the new software which have gone well. Finally, we discuss issues that remain unresolved and possible future improvements.}, address = {Vienna, Austria}, author = {Sigurðsson, Kristinn}, booktitle = {Proceedings of the 5th International Web Archiving Workshop (IWAW’05)}, interhash = {d84cf76a7001d472bd27dd092c0e1357}, intrahash = {1065880693b176515b5001af844e251f}, title = {Incremental crawling with Heritrix}, url = {http://iwaw.europarchive.org/05/papers/iwaw05-sigurdsson.pdf}, year = 2005 } @inproceedings{mohr2004introduction, abstract = {Heritrix is the Internet Archive's open-source, extensible, web-scale, archival-quality web crawler project. The Internet Archive started Heritrix development in the early part of 2003. The intention was to develop a crawler for the specific purpose of archiving websites and to support multiple different use cases including focused and broad crawling. The software is open source to encourage collaboration and joint development across institutions with similar needs. A pluggable, extensible architecture facilitates customization and outside contribution. Now, after over a year of development, the Internet Archive and other institutions are using Heritrix to perform focused and increasingly broad crawls.}, address = {Bath, UK}, author = {Mohr, G. and Kimpton, M. and Stack, M.
and Ranitovic, I.}, booktitle = {Proceedings of the 4th International Web Archiving Workshop (IWAW'04)}, interhash = {4d9fda8f3428384167ee23949442643d}, intrahash = {09d70d4ea1810fe89522755a0982169f}, month = jul, title = {Introduction to Heritrix, an archival quality web crawler}, url = {http://crawler.archive.org/Mohr-et-al-2004.pdf}, year = 2004 } @article{burner1997crawling, abstract = {When you ask what he's up to, Brewster Kahle likes to surprise you. So when designing supercomputers became cliche — and everybody was developing new technologies for full-text indexing — Brewster decided to archive the Internet. He thought the Internet would fit nicely into a box. The box he had in mind was a tape robot that, if it held enough tapes, could hold terabytes of data. Into this box, Brewster wanted to cram all of the publicly accessible data from the World Wide Web, anonymous FTP sites, USENET news, and public gopher sites. His vision was that this would become an archive of digital history, available forever as a research tool and time capsule. By visiting this "library," the world would be able to trace the development of technologies, styles, and cultural trends. Brewster knew it would be necessary to transcribe the archive on a continuing basis, lest the media deteriorate or the ability to read it disappear. Thus was born the Internet Archive. Housed in the Little House on the Presidio in San Francisco, the Archive is actively collecting all the Web data that can be pulled down two T1 lines. This article describes Brewster and his archivists' experiences, and what they have learned from them. Designing the Archive Crawler Collecting Web data isn't a black art. You just need a program that can "speak" HTTP and parse the HTML to find links (in the form of URLs) to additional network objects. Such a "crawler" program can start almost anywhere on the Web, and will eventually find almost every public page in existence. The catch is that to accomplish anything useful you have to keep track of all the retrieved objects. To archive the data, you must put it somewhere, and to do this efficiently you must crawl across many sites at once. If you have a conscience, this must be done without upsetting the millions of Web authors and administrators who have created and are supporting the data. So, when the Archive set out to design its crawler, three things were accepted as givens: The proposed Standard for Robot Exclusion (SRE), which provides mechanisms for Web-site owners to control robot behavior on their sites, must be scrupulously obeyed. The design must permit splitting the crawl across several machines, possibly at different geographic locations. The objects retrieved must be "bundled" into large files, since the tape robot's filesystem cannot manage hundreds of millions of small objects. Beyond these constraints, the Archive had several concerns: The crawler should be polite, so as not to unduly burden a Web site when it visits. The crawling software must use the hardware resources efficiently, to optimize available bandwidth. The storage strategy should support the anticipated retrieval needs. Driven by these constraints and concerns, a common theme emerged in design discussions: how "expensive" it was to hop from site to site.
If, for example, there were a huge list of unsorted URLs, and the crawler just grabbed one as needed, it would have to parse the URL to find the host and port, look up the IP address of the host, and fetch the robot exclusions for the site, all before requesting the object that the URL referenced. It would be difficult to prevent the same site being visited simultaneously by multiple crawlers and to retrieve the large number of "bundles" containing all of the objects for a given site. The Archive concluded that it would be easiest to crawl on a site-by-site basis. Thus, the following design emerged: The list of sites would be built by collecting "external" references from Web pages (those references pointing off-site). A single process would be assigned a queue of sites, and would crawl a number of them (currently 64) at once. Each process would loop through sites using nonblocking I/O in a "select loop" (even though the crawler actually uses the Solaris "poll" system call). The crawler would let sites "rest" between object retrievals, so as not to overload any one site. The advantages of this approach were manifold, and serendipitous ones emerged as the crawler developed. With this design, IP addresses and robot exclusions could be held in memory for each site being crawled, multiple crawling machines would never collide on the same site, and the data for a single site would all be contained in a small number of bundles (often just one). How it Works A typical Archive-crawler visit to a Web site begins when the crawling process fetches a site name, IP address, and port number from the site queue; see Figure 1. The crawler then opens (or creates) the "crawl queue" for the site, which keeps track of the URL paths on the site that have been, or need to be, retrieved. If the queue does not already exist, the path referring to the root of the site ("/") is put in the queue. The next step is to retrieve and parse robots.txt, which may contain exclusions indicating paths the crawler should not pursue. Those exclusions are stored in memory for later use. One by one, the crawler fetches references to objects from the crawl queue. The path is checked against any exclusions for the site — what's not excluded is fetched. If the object has a content type of "text/html," the crawler parses the contents of the object looking for references to other documents. When one is found, the path is normalized. Relative paths are made absolute, and hostnames are made all lower case. If a reference such as "../images/chevette.gif" were found in the document /chevrolet/classics/chevette.html, the path of the new URL would be normalized to /chevrolet/images/chevette.gif. If the reference is to an object on the same site, the crawler checks to see if it is new. If the reference is new it is added to the crawl queue, which is updated on disk so that the crawl can resume where it left off if it is interrupted. Fetched objects are written into the large "bundle" files, preceded by a line describing the object that includes the URL, the object size, and the date it was retrieved. You search for an object in a bundle file by reading the description line, then the object (because the size is in the description), then the next description, and so on until the desired object is found. This is tedious, however; an index is obviously necessary. Indexing The Archive indexes Web content differently from search engines such as AltaVista.
Those systems create keyword or full-text indexes of the Web's textual content to permit queries such as find me all Web pages with the phrase "little red corvette" in them. The Archive is building more of a card catalog, to support queries such as: Tell me the modification times and locations in the Archive of all objects with the URL http://www.automobile.org/features/safetybelts.html. For the search services, indexing strategies are core business technologies that directly affect how quickly they keep up-to-date with the Web. For the Archive, indexing is merely a major headache. Assuming you have twenty gigabytes of disk space, it is pretty easy to create a table of 100 million URLs and another of 200 million object retrievals. Unfortunately, it takes a long time to find anything in such huge tables, unless you can afford a machine with 8 GB of memory to hold the table indexes in RAM. So the Archive has split the data into about a thousand smaller tables, numbered with a hashed value of the server name. While this has made queries tolerable, in the long run that 8GB machine will probably be needed. The URL Problem Once the Archive had a crawler design and a cataloging strategy, "all" that was left was programming. If you polled the few dozen engineers who have actually implemented an ambitious Web-crawling effort, they'd probably cite dealing with the domain-name service and answering the question "Do I know about this URL?" as the most annoying stumbling blocks. There are at least 100 million unique URLs on the Web. They include HTML documents, images, sound files, movies, applications, and a host of less common file types. As the crawler locates URLs in HTML documents, it must decide whether it has already retrieved the referenced object, or whether this is a new link. The problem is not even as "simple" as comparing it to the 100 million known links, since the same machine may have multiple names. The URLs http://www.automobile.org/vw/bug.html and http://home.automobile.org/vw/bug.html may be two names for the same object. The straightforward approach would be to load up a relational database with all URLs — including all aliases — and just query against it when a URL is located in an HTML document. But when fetching ten pages per second, with an average of 18 links per page (including href, src, background, url, code, and lowsrc), standard database engines just could not keep up. The Archive's solution is a giant bitmap, renowned as the Swiss army knife of high-performance programming; see Figure 2. The trick is to allocate a chunk of memory, zero it, and set bits to keep track of what you have already seen. To choose the bits, the crawler computes ten hash values of the URL string, using ten different hashing algorithms (well, actually, it uses one algorithm and ten different tables of ASCII-integer mappings). So when an URL is located, its hash array is computed, the appropriate bits are checked in the bitmap, and if all ten are already set, the URL is deemed "already found" (though not necessarily "already visited"). If any of the bits are not set, the URL is new; it is added to the queue and its bits are set in the bitmap. This approach is very fast, but may render "false positives" (identifying an unseen URL as already seen). Statistically, a bitmap with two bytes for each string being tracked is pretty safe. One has to wonder, though, if "index.html" hashes to the same values as some other common filename. 
As fast as the giant bitmap is, it is still difficult to work with the whole Web. Two bytes times 100 million is a lot of memory. Moreover, it is nearly impossible for multiple crawling machines to synchronize their bitmaps in real time. This is one of the problems serendipitously solved by the Archive's decision to crawl by site. Because most links in an HTML document are "local" to the site, most of the "Have we seen it?" questions can be answered by checking a bitmap specific to the site being crawled. This bitmap can be much smaller (usually 256K bits), and need not be synchronized with other bitmaps. External references are simply written to disk and batch-processed by a separate program. That program is the only one that needs to wield a 2GB bitmap, and that bitmap can be saved to disk when not in use. DNS The domain name service (DNS) is a clever, distributed database that allows the machines on the Internet to recognize each other in the absence of a central naming authority. A few top-level servers can tell you what machine to contact for information about machines in a given domain. A "domain" designates an organization, such as harvard.edu or microsoft.com; there are currently more than 800,000 registered domains in the United States, and many more abroad. For a domain to work properly, it must have a "nameserver," a machine that knows the names and addresses of all of the machines in the domain. So to get to cus.cam.ac.uk, a Web browser would first contact one of the top-level servers, which would redirect it to a server in the United Kingdom, which in turn would likely redirect it to a machine at Cambridge University that would provide the IP address for cus.cam.ac.uk. This name-resolution scheme works fine for casual browsing, which only requires a name lookup every few minutes, but it becomes a serious bottleneck for high-speed, widely distributed data collection. Another problem with DNS is the issue of uniqueness. It is common for a single machine to have several names; this creates intuitively named aliases for machines that provide common services (such as "www," "mail," or "news") and is a simple means for creating "virtual domains." Clearly, it would be undesirable to crawl the same machine multiple times, just because it had multiple names within DNS. Worse yet, multiple instances of the crawler might simultaneously access the same machine, causing undue load on the site. To avoid crawling the same data multiple times, the crawler must treat all names that resolve to the same IP address as the same host. So the IP address of a newly resolved hostname must be compared to all other IP addresses in the crawler's site queue to determine if the address is new. Unlike most crawling groups, the Archive has a third problem with DNS: tracking its history. The global namespace is in constant flux. The name www.automobile.org may point to model-t.automobile.org today, hudson.automobile.org next month, and ferrari.automobile.org next year. The Archive needs to keep track of these mappings, so that when a user asks to see www.automobile.org from December of 1996, the Archive will know that means the data fetched from model-t. Therefore, the Archive created a database to map DNS over time. Each record has a value, a type, an entry date, a superseded date, and an ID number. The value may hold either a name or an address and the type may be canonical, address, or alias. 
The "canonical" name of the machine is what the Archive actually calls it; it is always a canonical name within DNS, but is arbitrarily chosen if a given machine has multiple canonical DNS names. An address is the IP address that the Archive uses; if a machine has multiple IP addresses, the "extras" will be stored as aliases. An alias is a DNS alias ("www" is an alias about half of the time), an extra canonical name, or an extra address. The entry date is when the information was first looked up. The superseded date is when it became invalid; a current entry has an empty (NULL) superseded date. All records referring to the same canonical record share an ID number. Figure 3 shows how to retrieve the current address for www.automobile.org in SQL. In English, that would be: get me the address that hasn't expired for the host that is currently called www.automobile.org The DNS map is maintained separately from the crawl, so the crawling software never has to wait for name resolution to retrieve an object. Occasionally, the name-address pairs will become obsolete, but this is rare enough that the cost of revisiting is far lower than the cost of resolving the name more frequently. Conclusion Despite the trials of implementation, the tribulations of adequate network connectivity, and the travails of maintaining tape robots driven constantly at capacity, the Archive is being built. The crawler has essentially been running nonstop since October 1996, and as of the end of February 1997, the Archive has about 2.5 terabytes of data under management. What will become of all of the data the Archive is collecting? The answer will depend on how copyright laws are interpreted as they relate to digital media. Perhaps it will be a decade or more before the time capsule can be opened for all to see, or perhaps an acceptable means of electronically "permitting" the publication will emerge much sooner. Those who favor the latter see the Archive offering two essential services: a citation service for the Web and a solution to "link rot." Who has not been referred to a page by a publication, only to find that the content has changed? And most of us do not go a week without encountering a "404" (File Not Found) when following a link from our favorite search engine, because the referenced page is no longer there. What if Web Techniques could write "for an early example of the use of frames, see archive:19951012/www.webtechniques .com/frames.html," and be confident that its readers, even years hence, would see what they were meant to see? Or, what if you could actually find that page at http://www.irs.ustreas.gov that explained the 1995 self-employment tax rules, even though you were being audited in 1998? Most people believe the Archive is a Good Thing, but it is a monumental task, and they are happy to leave it to Brewster Kahle and his cohorts at the Internet Archive. (Also see "The Truth About the Web".) Mike is a systems architect at the Internet Archive and wrote the Archive's Web-crawling software. Contact him at mike@archive.org. 
}, author = {Burner, Mike}, interhash = {af763eaab27b3d464b6cfa04e0dc9ade}, intrahash = {583c11530a2aa109f0cc274ae0982675}, journal = {Web Techniques Magazine}, number = 5, title = {Crawling towards Eternity: Building An Archive of The World Wide Web}, url = {http://web.archive.org/web/20080101070319/http://www.webtechniques.com/archives/1997/05/burner/}, volume = 2, year = 1997 } @inproceedings{liu2011browsing, abstract = {To optimize the performance of web crawlers, various page importance measures have been studied to select and order URLs in crawling. Most sophisticated measures (e.g. breadth-first and PageRank) are based on link structure. In this paper, we treat the problem from another perspective and propose to measure page importance through mining user interest and behaviors from web browse logs. Unlike most existing approaches which work on a single URL, in this paper, both the log mining and the crawl ordering are performed at the granularity of URL patterns. The proposed URL pattern-based crawl orderings are able to properly predict the importance of newly created (unseen) URLs. Promising experimental results prove the feasibility of our approach.}, acmid = {2063593}, address = {New York, NY, USA}, author = {Liu, Minghai and Cai, Rui and Zhang, Ming and Zhang, Lei}, booktitle = {Proceedings of the 20th ACM international conference on Information and knowledge management}, doi = {10.1145/2063576.2063593}, interhash = {7b45567cb6a492d8354dc32401549291}, intrahash = {3ce89bd8a3d3eb6306b739fe1f4088df}, isbn = {978-1-4503-0717-8}, location = {Glasgow, Scotland, UK}, numpages = {6}, pages = {87--92}, publisher = {ACM}, title = {User browsing behavior-driven web crawling}, url = {http://doi.acm.org/10.1145/2063576.2063593}, year = 2011 } @inproceedings{bai2011discovering, abstract = {Search engines rely upon crawling to build their Web page collections. A Web crawler typically discovers new URLs by following the link structure induced by links on Web pages. As the number of documents on the Web is large, discovering newly created URLs may take arbitrarily long, and depending on how a given page is connected to others, such a crawler may miss the pages altogether. In this paper, we evaluate the benefits of integrating a passive URL discovery mechanism into a Web crawler. This mechanism is passive in the sense that it does not require the crawler to actively fetch documents from the Web to discover URLs. We focus here on a mechanism that uses toolbar data as a representative source for new URL discovery. We use the toolbar logs of Yahoo! to characterize the URLs that are accessed by users via their browsers, but not discovered by Yahoo! Web crawler. We show that a high fraction of URLs that appear in toolbar logs are not discovered by the crawler. We also reveal that a certain fraction of URLs are discovered by the crawler later than the time they are first accessed by users. One important conclusion of our work is that web search engines can highly benefit from user feedback in the form of toolbar logs for passive URL discovery.}, acmid = {2063592}, address = {New York, NY, USA}, author = {Bai, Xiao and Cambazoglu, B.
Barla and Junqueira, Flavio P.}, booktitle = {Proceedings of the 20th ACM international conference on Information and knowledge management}, doi = {10.1145/2063576.2063592}, interhash = {dfef0e1af73b9c9e5096a2118368ad21}, intrahash = {4e73c9d6ed79931ccdfcfda938e3be62}, isbn = {978-1-4503-0717-8}, location = {Glasgow, Scotland, UK}, numpages = {10}, pages = {77--86}, publisher = {ACM}, title = {Discovering URLs through user feedback}, url = {http://doi.acm.org/10.1145/2063576.2063592}, year = 2011 } @inproceedings{bensaad2011archiving, abstract = {A pattern is a model or a template used to summarize and describe the behavior (or the trend) of a data having generally some recurrent events. Patterns have received a considerable attention in recent years and were widely studied in the data mining field. Various pattern mining approaches have been proposed and used for different applications such as network monitoring, moving object tracking, financial or medical data analysis, scientific data processing, etc. In these different contexts, discovered patterns were useful to detect anomalies, to predict data behavior (or trend), or more generally, to simplify data processing or to improve system performance. However, to the best of our knowledge, patterns have never been used in the context of web archiving. Web archiving is the process of continuously collecting and preserving portions of the World Wide Web for future generations. In this paper, we show how patterns of page changes can be useful tools to efficiently archive web sites. We first define our pattern model that describes the changes of pages. Then, we present the strategy used to (i) extract the temporal evolution of page changes, to (ii) discover patterns and to (iii) exploit them to improve web archives. We choose the archive of French public TV channels « France Télévisions » as a case study in order to validate our approach. Our experimental evaluation based on real web pages shows the utility of patterns to improve archive quality and to optimize indexing or storing.}, acmid = {1998098}, address = {New York, NY, USA}, author = {Ben Saad, Myriam and Gançarski, Stéphane}, booktitle = {Proceedings of the 11th annual international ACM/IEEE joint conference on Digital libraries}, doi = {10.1145/1998076.1998098}, interhash = {88a952fa20259b4d32e583b523eea979}, intrahash = {07edd88128d243297d23786c54c78dce}, isbn = {978-1-4503-0744-4}, location = {Ottawa, Ontario, Canada}, numpages = {10}, pages = {113--122}, publisher = {ACM}, title = {Archiving the web using page changes patterns: a case study}, url = {http://doi.acm.org/10.1145/1998076.1998098}, year = 2011 } @inproceedings{cho2007rankmass, abstract = {Crawling algorithms have been the subject of extensive research and optimizations, but some important questions remain open. In particular, given the unbounded number of pages available on the Web, search-engine operators constantly struggle with the following vexing questions: When can I stop downloading the Web? How many pages should I download to cover "most" of the Web? How can I know I am not missing an important part when I stop? 
In this paper we provide an answer to these questions by developing, in the context of a system that is given a set of trusted pages, a family of crawling algorithms that (1) provide a theoretical guarantee on how much of the "important" part of the Web it will download after crawling a certain number of pages and (2) give a high priority to important pages during a crawl, so that the search engine can index the most important part of the Web first. We prove the correctness of our algorithms by theoretical analysis and evaluate their performance experimentally based on 141 million URLs obtained from the Web. Our experiments demonstrate that even our simple algorithm is effective in downloading important pages early on and provides high "coverage" of the Web with a relatively small number of pages.}, acmid = {1325897}, author = {Cho, Junghoo and Schonfeld, Uri}, booktitle = {Proceedings of the 33rd international conference on Very large data bases}, interhash = {c5573f70e067624e3a559996172a45ef}, intrahash = {3227ef077a463fbaa6ba1ac7aac82d06}, isbn = {978-1-59593-649-3}, location = {Vienna, Austria}, numpages = {12}, pages = {375--386}, publisher = {VLDB Endowment}, title = {RankMass crawler: a crawler with high personalized pagerank coverage guarantee}, url = {http://dl.acm.org/citation.cfm?id=1325851.1325897}, year = 2007 } @inproceedings{olston2008recrawl, abstract = {It is crucial for a web crawler to distinguish between ephemeral and persistent content. Ephemeral content (e.g., quote of the day) is usually not worth crawling, because by the time it reaches the index it is no longer representative of the web page from which it was acquired. On the other hand, content that persists across multiple page updates (e.g., recent blog postings) may be worth acquiring, because it matches the page's true content for a sustained period of time.

In this paper we characterize the longevity of information found on the web, via both empirical measurements and a generative model that coincides with these measurements. We then develop new recrawl scheduling policies that take longevity into account. As we show via experiments over real web data, our policies obtain better freshness at lower cost, compared with previous approaches.}, acmid = {1367557}, address = {New York, NY, USA}, author = {Olston, Christopher and Pandey, Sandeep}, booktitle = {Proceedings of the 17th international conference on World Wide Web}, doi = {10.1145/1367497.1367557}, interhash = {62dabc7c7aa03203804fde1b32b5fbe0}, intrahash = {68ecda3b2d943f8625add57a3a2f3a7c}, isbn = {978-1-60558-085-2}, location = {Beijing, China}, numpages = {10}, pages = {437--446}, publisher = {ACM}, title = {Recrawl scheduling based on information longevity}, url = {http://doi.acm.org/10.1145/1367497.1367557}, year = 2008 } @inproceedings{pandey2005usercentric, abstract = {Search engines are the primary gateways of information access on the Web today. Behind the scenes, search engines crawl the Web to populate a local indexed repository of Web pages, used to answer user search queries. In an aggregate sense, the Web is very dynamic, causing any repository of Web pages to become out of date over time, which in turn causes query answer quality to degrade. Given the considerable size, dynamicity, and degree of autonomy of the Web as a whole, it is not feasible for a search engine to maintain its repository exactly synchronized with the Web. In this paper we study how to schedule Web pages for selective (re)downloading into a search engine repository. The scheduling objective is to maximize the quality of the user experience for those who query the search engine. We begin with a quantitative characterization of the way in which the discrepancy between the content of the repository and the current content of the live Web impacts the quality of the user experience. This characterization leads to a user-centric metric of the quality of a search engine's local repository. We use this metric to derive a policy for scheduling Web page (re)downloading that is driven by search engine usage and free of exterior tuning parameters. We then focus on the important subproblem of scheduling refreshing of Web pages already present in the repository, and show how to compute the priorities efficiently. We provide extensive empirical comparisons of our user-centric method against prior Web page refresh strategies, using real Web data. Our results demonstrate that our method requires far fewer resources to maintain the same search engine quality level for users, leaving substantially more resources available for incorporating new Web pages into the search repository.}, acmid = {1060805}, address = {New York, NY, USA}, author = {Pandey, Sandeep and Olston, Christopher}, booktitle = {Proceedings of the 14th international conference on World Wide Web}, doi = {10.1145/1060745.1060805}, interhash = {4d0e8067c9240b05c42bf8e174ffb1d1}, intrahash = {166a0a9f8d80beeab0c75961398d951f}, isbn = {1-59593-046-9}, location = {Chiba, Japan}, numpages = {11}, pages = {401--411}, publisher = {ACM}, title = {User-centric Web crawling}, url = {http://doi.acm.org/10.1145/1060745.1060805}, year = 2005 } @phdthesis{castillo2004effective, abstract = {The key factors for the success of the World Wide Web are its large size and the lack of a centralized control over its contents.
Both issues are also the most important source of problems for locating information. The Web is a context in which traditional Information Retrieval methods are challenged, and given the volume of the Web and its speed of change, the coverage of modern search engines is relatively small. Moreover, the distribution of quality is very skewed, and interesting pages are scarce in comparison with the rest of the content. Web crawling is the process used by search engines to collect pages from the Web. This thesis studies Web crawling at several different levels, ranging from the long-term goal of crawling important pages first, to the short-term goal of using the network connectivity efficiently, including implementation issues that are essential for crawling in practice. We start by designing a new model and architecture for a Web crawler that tightly integrates the crawler with the rest of the search engine, providing access to the metadata and links of the documents that can be used to guide the crawling process effectively. We implement this design in the WIRE project as an efficient Web crawler that provides an experimental framework for this research. In fact, we have used our crawler to characterize the Chilean Web, using the results as feedback to improve the crawler design. We argue that the number of pages on the Web can be considered infinite, and given that a Web crawler cannot download all the pages, it is important to capture the most important ones as early as possible during the crawling process. We propose, study, and implement algorithms for achieving this goal, showing that we can crawl 50% of a large Web collection and capture 80% of its total Pagerank value in both simulated and real Web environments. We also model and study user browsing behavior in Web sites, concluding that it is not necessary to go deeper than five levels from the home page to capture most of the pages actually visited by people, and support this conclusion with log analysis of several Web sites. We also propose several mechanisms for server cooperation to reduce network traffic and improve the representation of a Web page in a search engine with the help of Web site managers.}, address = {Santiago, Chile}, author = {Castillo, Carlos}, institution = {University of Chile}, interhash = {36eac63e7cfae05bc7444171432a6f3f}, intrahash = {38b52bf7ccc2e1221477f5d8937c3b7d}, month = {November}, school = {School of Engineering}, title = {Effective Web Crawling}, url = {http://www.chato.cl/crawling/}, year = 2004 } @inproceedings{tane03courseware, abstract = {Topics in education are changing at an ever faster pace. E-Learning resources tend to be more and more decentralised. Users increasingly need to be able to use the resources of the web. For this, they should have tools for finding and organizing information in a decentralised way.
In this paper, we show how an ontology-based tool suite allows users to make the most of the resources available on the web.}, author = {Tane, Julien and Schmitz, Christoph and Stumme, Gerd and Staab, Steffen and Studer, R.}, booktitle = {Mobiles Lernen und Forschen - Beiträge der Fachtagung an der Universität}, comment = {alpha}, editor = {David, Klaus and Wegner, Lutz}, interhash = {7f33080bb78d089b24bf51c059f8f018}, intrahash = {850949481723b7dd03768ccd96b25cb9}, month = {November}, pages = {93-104}, publisher = {Kassel University Press}, title = {The Courseware Watchdog: an Ontology-based tool for finding and organizing learning material}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2003/tane2003courseware.pdf}, year = 2003 } @inproceedings{952761, abstract = {The Web, the largest unstructured database of the world, has greatly improved access to documents. However, documents on the Web are largely disorganized. Due to the distributed nature of the World Wide Web it is difficult to use it as a tool for information and knowledge management. Therefore, users doing the difficult task of exploring the Web have to be supported by intelligent means. This paper proposes an approach for document discovery building on a comprehensive framework for ontology-focused crawling of Web documents. Our framework includes means for using a complex ontology and associated instance elements. It defines several relevance computation strategies and provides an empirical evaluation which has shown promising results.}, address = {New York, NY, USA}, author = {Ehrig, Marc and Maedche, Alexander}, booktitle = {SAC '03: Proceedings of the 2003 ACM symposium on Applied computing}, doi = {10.1145/952532.952761}, interhash = {28f82d45e89849ce80f05ac5d11e8611}, intrahash = {9d9bcba93b086195f41402c83da3ff07}, isbn = {1-58113-624-2}, location = {Melbourne, Florida}, pages = {1174--1178}, publisher = {ACM}, title = {Ontology-focused crawling of Web documents}, url = {http://www.aifb.uni-karlsruhe.de/WBS/meh/publications/ehrig03ontology.pdf}, year = 2003 } @article{Chakrabartietal99, author = {Chakrabarti, S. and van den Berg, M. and Dom, B.}, interhash = {e35ac8e9c02ab2a5075b9c1692ac7a2d}, intrahash = {004dd97a2b2e71fa2cfe6820c74c9701}, journal = {Computer Networks}, pages = {1623--1640}, title = {Focused Crawling: A New Approach to Topic-Specific Web Resource Discovery}, url = {http://citeseer.nj.nec.com/chakrabarti99focused.html}, volume = 31, year = 1999 } @inproceedings{69, address = {Cairo}, author = {Diligenti, M. and Coetzee, F. and Lawrence, S. and Giles, C.L. and Gori, M.}, booktitle = {Proceedings of the 26th International Conference on Very Large Data Bases (VLDB)}, interhash = {771abf0df1f36e80e33ab92508602a6e}, intrahash = {8e47c9be504ca398049afc038f4004d6}, month = {September}, pages = {527--534}, title = {Focused crawling using context graphs}, url = {http://www.neci.nec.com/~lawrence/papers/focus-vldb00/focus-vldb00.pdf}, year = 2000 } @inproceedings{Maedcheetalsubmitted, address = {Hawaii}, author = {Maedche, A. and Ehrig, M. and Handschuh, S. and Stojanovic, L.
and Volz, R.}, booktitle = {Proceedings of the Eleventh International World Wide Web Conference (WWW 2002)}, interhash = {833fc61ede44e31a5af6c77c86baa43a}, intrahash = {f20b5a51398bb01659ad099794ebf06d}, title = {Ontology-Focused Crawling of Documents and Relational Metadata}, year = 2002 } @inproceedings{Aggarwal02, author = {Aggarwal, C.C.}, booktitle = {Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '02)}, interhash = {594fa3392643b7c5877e27a200331db6}, intrahash = {917a28c3f139ca98241d7c3ef09d3389}, pages = {423--428}, title = {Collaborative Crawling: Mining User Experiences for Topical Resource Discovery}, year = 2002 }