@incollection{pieper2009wissenschaftliche,
  abstract = {This contribution examines to what extent documents held on the document servers of academic institutions are indexed by the general search engines Google and Yahoo, and to what degree academic search engines are better suited to finding such documents. To this end, the five search engines BASE, Google Scholar, OAIster, Scientific Commons and Scirus are briefly described and compared with one another. The main focus is on their differing contents, search functions and output options; a retrieval test is used to examine specifically how well the search engines find documents whose full texts are directly accessible without restrictions, in the spirit of open access.},
  author = {Pieper, Dirk and Wolf, Sebastian},
  booktitle = {Handbuch Internet-Suchmaschinen: Nutzerorientierung in Wissenschaft und Praxis},
  editor = {Lewandowski, Dirk},
  interhash = {b915fb45a9a6dc3499247e76992c7897},
  intrahash = {1f997db426731303690c9bb962f1c158},
  pages = {356--374},
  publisher = {Akademische Verlagsgesellschaft AKA},
  title = {Wissenschaftliche Dokumente in Suchmaschinen},
  url = {http://eprints.rclis.org/12746/},
  year = 2009
}

@book{croft2010search,
  address = {Boston},
  author = {Croft, W. Bruce and Metzler, Donald and Strohman, Trevor},
  edition = {1st},
  interhash = {bc2cb2c872ddae363967b53064670cd8},
  intrahash = {6dbe8ff9de4f8b16c442247baf8abe73},
  isbn = {978-0-13-607224-9},
  month = feb,
  publisher = {Addison-Wesley},
  refid = {268788295},
  title = {Search engines: information retrieval in practice},
  url = {http://www.amazon.com/Search-Engines-Information-Retrieval-Practice/dp/0136072240},
  year = 2010
}

@inproceedings{liu2011browsing,
  abstract = {To optimize the performance of web crawlers, various page importance measures have been studied to select and order URLs in crawling. Most sophisticated measures (e.g. breadth-first and PageRank) are based on link structure. In this paper, we treat the problem from another perspective and propose to measure page importance through mining user interest and behaviors from web browse logs. Unlike most existing approaches which work on single URL, in this paper, both the log mining and the crawl ordering are performed at the granularity of URL pattern. The proposed URL pattern-based crawl orderings are capable to properly predict the importance of newly created (unseen) URLs. Promising experimental results proved the feasibility of our approach.},
  acmid = {2063593},
  address = {New York, NY, USA},
  author = {Liu, Minghai and Cai, Rui and Zhang, Ming and Zhang, Lei},
  booktitle = {Proceedings of the 20th ACM international conference on Information and knowledge management},
  doi = {10.1145/2063576.2063593},
  interhash = {7b45567cb6a492d8354dc32401549291},
  intrahash = {3ce89bd8a3d3eb6306b739fe1f4088df},
  isbn = {978-1-4503-0717-8},
  location = {Glasgow, Scotland, UK},
  numpages = {6},
  pages = {87--92},
  publisher = {ACM},
  title = {User browsing behavior-driven web crawling},
  url = {http://doi.acm.org/10.1145/2063576.2063593},
  year = 2011
}

@inproceedings{bahmani2012pagerank,
  abstract = {One of the most important features of the Web graph and social networks is that they are constantly evolving. The classical computational paradigm, which assumes a fixed data set as an input to an algorithm that terminates, is inadequate for such settings.
    We propose an algorithm that, at any moment in the time and by crawling a small portion of the graph, provides an estimate of the PageRank that is close to the true PageRank of the graph at that moment. We will also evaluate our algorithm experimentally on real data sets and on randomly generated inputs. Under a stylized model of graph evolution, we show that our algorithm achieves a provable performance guarantee that is significantly better than the naive algorithm that crawls the nodes in a round-robin fashion.},
  acmid = {2339539},
  address = {New York, NY, USA},
  author = {Bahmani, Bahman and Kumar, Ravi and Mahdian, Mohammad and Upfal, Eli},
  booktitle = {Proceedings of the 18th ACM SIGKDD international conference on Knowledge discovery and data mining},
  doi = {10.1145/2339530.2339539},
  interhash = {4572c8d52b91bf4487183b6c3b900827},
  intrahash = {6058356e9c5a62b3993686ff5eac9529},
  isbn = {978-1-4503-1462-6},
  location = {Beijing, China},
  numpages = {9},
  pages = {24--32},
  publisher = {ACM},
  title = {PageRank on an evolving graph},
  url = {http://doi.acm.org/10.1145/2339530.2339539},
  year = 2012
}

@inproceedings{cho2007rankmass,
  abstract = {Crawling algorithms have been the subject of extensive research and optimizations, but some important questions remain open. In particular, given the unbounded number of pages available on the Web, search-engine operators constantly struggle with the following vexing questions: When can I stop downloading the Web? How many pages should I download to cover "most" of the Web? How can I know I am not missing an important part when I stop? In this paper we provide an answer to these questions by developing, in the context of a system that is given a set of trusted pages, a family of crawling algorithms that (1) provide a theoretical guarantee on how much of the "important" part of the Web it will download after crawling a certain number of pages and (2) give a high priority to important pages during a crawl, so that the search engine can index the most important part of the Web first. We prove the correctness of our algorithms by theoretical analysis and evaluate their performance experimentally based on 141 million URLs obtained from the Web. Our experiments demonstrate that even our simple algorithm is effective in downloading important pages early on and provides high "coverage" of the Web with a relatively small number of pages.},
  acmid = {1325897},
  author = {Cho, Junghoo and Schonfeld, Uri},
  booktitle = {Proceedings of the 33rd international conference on Very large data bases},
  interhash = {c5573f70e067624e3a559996172a45ef},
  intrahash = {3227ef077a463fbaa6ba1ac7aac82d06},
  isbn = {978-1-59593-649-3},
  location = {Vienna, Austria},
  numpages = {12},
  pages = {375--386},
  publisher = {VLDB Endowment},
  title = {RankMass crawler: a crawler with high personalized pagerank coverage guarantee},
  url = {http://dl.acm.org/citation.cfm?id=1325851.1325897},
  year = 2007
}