% Bibliography database (BibTeX). One entry per record, one field per line.
% The non-standard fields interhash/intrahash, citeulike-article-id, posted-at,
% priority, vgwort, asin, ee, typesource and bb-further-address are kept verbatim;
% standard styles ignore unknown fields. (interhash/intrahash presumably come from
% a reference-manager export -- verify before removing.)
% Text outside of entries, such as these lines, is ignored by BibTeX.

% NOTE(review): article entry without a journal field -- confirm the venue and
% complete or retype this entry.
@article{morstatter2013sample,
  author    = {Morstatter, Fred and Pfeffer, J{\"u}rgen and Liu, Huan and Carley, Kathleen M.},
  interhash = {bca742d25a5f5fa43c8f106460449b5b},
  intrahash = {58707a28cc5098b9b3444501d5ca9a88},
  title     = {Is the Sample Good Enough? Comparing Data from {Twitter’s} Streaming {API} with {Twitter’s} {Firehose}},
  url       = {http://scholar.google.de/scholar.bib?q=info:NkS2afIrqyQJ:scholar.google.com/&output=citation&hl=de&as_sdt=0,5&ct=citation&cd=0},
  year      = 2013
}

@inproceedings{bullock2011tagging,
  author    = {Bullock, Beate Navarro and Jäschke, Robert and Hotho, Andreas},
  booktitle = {Proceedings of the ACM WebSci'11},
  interhash = {7afaa67dfeb07f7e0b85abf2be61aff1},
  intrahash = {493e03868a98f498628cad31f9320e9f},
  month     = jun,
  title     = {Tagging data as implicit feedback for learning-to-rank},
  url       = {http://journal.webscience.org/463/},
  year      = 2011
}

@misc{Sarma2011,
  abstract      = {De-duplication---identification of distinct records referring to the same real-world entity---is a well-known challenge in data integration. Since very large datasets prohibit the comparison of every pair of records, {\em blocking} has been identified as a technique of dividing the dataset for pairwise comparisons, thereby trading off {\em recall} of identified duplicates for {\em efficiency}. Traditional de-duplication tasks, while challenging, typically involved a fixed schema such as Census data or medical records. However, with the presence of large, diverse sets of structured data on the web and the need to organize it effectively on content portals, de-duplication systems need to scale in a new dimension to handle a large number of schemas, tasks and data sets, while handling ever larger problem sizes. In addition, when working in a map-reduce framework it is important that canopy formation be implemented as a {\em hash function}, making the canopy design problem more challenging. We present CBLOCK, a system that addresses these challenges. CBLOCK learns hash functions automatically from attribute domains and a labeled dataset consisting of duplicates. Subsequently, CBLOCK expresses blocking functions using a hierarchical tree structure composed of atomic hash functions. The application may guide the automated blocking process based on architectural constraints, such as by specifying a maximum size of each block (based on memory requirements), impose disjointness of blocks (in a grid environment), or specify a particular objective function trading off recall for efficiency. As a post-processing step to automatically generated blocks, CBLOCK {\em rolls-up} smaller blocks to increase recall. We present experimental results on two large-scale de-duplication datasets at Yahoo!---consisting of over 140K movies and 40K restaurants respectively---and demonstrate the utility of CBLOCK.},
  archiveprefix = {arXiv},
  author        = {Das Sarma, Anish and Jain, Ankur and Machanavajjhala, Ashwin and Bohannon, Philip},
  eprint        = {1111.3689},
  interhash     = {3f32848ef4bb26a3057c3feadff99c5a},
  intrahash     = {389dba4432b1340211ef6be8e3d45a1d},
  title         = {{CBLOCK}: An Automatic Blocking Mechanism for Large-Scale De-duplication Tasks},
  url           = {http://arxiv.org/abs/1111.3689},
  year          = 2011
}

@inproceedings{conf/semweb/TangHLL06,
  author    = {Tang, Jie and Hong, MingCai and Li, Juan-Zi and Liang, Bangyong},
  booktitle = {International Semantic Web Conference},
  crossref  = {conf/semweb/2006},
  date      = {2006-11-09},
  doi       = {10.1007/11926078_46},
  editor    = {Cruz, Isabel F. and Decker, Stefan and Allemang, Dean and Preist, Chris and Schwabe, Daniel and Mika, Peter and Uschold, Michael and Aroyo, Lora},
  ee        = {http://dx.doi.org/10.1007/11926078_46},
  interhash = {0cd79ca123126fe66d0e2f2888222c79},
  intrahash = {e378a25116a480b55e64a919a351f1a7},
  isbn      = {3-540-49029-9},
  pages     = {640--653},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  title     = {Tree-Structured Conditional Random Fields for Semantic Annotation},
  url       = {http://dblp.uni-trier.de/db/conf/semweb/iswc2006.html#TangHLL06},
  volume    = 4273,
  year      = 2006
}

@inproceedings{anti2008krause,
  address   = {New York, NY, USA},
  author    = {Krause, Beate and Schmitz, Christoph and Hotho, Andreas and Stumme, Gerd},
  booktitle = {AIRWeb '08: Proceedings of the 4th international workshop on Adversarial information retrieval on the web},
  doi       = {10.1145/1451983.1451998},
  interhash = {a45d40ac7776551301ad9dde5b25357f},
  intrahash = {68effe5d4b9460f9388e7685310f74c2},
  isbn      = {978-1-60558-159-0},
  location  = {Beijing, China},
  pages     = {61--68},
  publisher = {ACM},
  title     = {The Anti-Social Tagger - Detecting Spam in Social Bookmarking Systems},
  url       = {http://airweb.cse.lehigh.edu/2008/submissions/krause_2008_anti_social_tagger.pdf},
  year      = 2008
}

% The former date value (2002-01-03) contradicted year = 1996, so it is kept
% under the ignored field name "timestamp" instead.
% NOTE(review): no publisher field -- add once confirmed.
@incollection{books/mit/fayyadPSU96/FayyadPS96,
  author    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  interhash = {79663e4b1f464b82ce1ae45345dc424f},
  intrahash = {e59886c68d1fc9bb4d1a8d6a1a644a60},
  pages     = {1--34},
  timestamp = {2002-01-03},
  title     = {From Data Mining to Knowledge Discovery: An Overview},
  url       = {http://dblp.uni-trier.de/db/books/collections/fayyad96.html#FayyadPS96},
  year      = 1996
}

@inproceedings{agrawal93,
  address   = {New York, NY, USA},
  author    = {Agrawal, Rakesh and Imielinski, Tomasz and Swami, Arun},
  booktitle = {SIGMOD '93: Proceedings of the 1993 ACM SIGMOD international conference on Management of data},
  interhash = {53341ce3e6ce51c3bcf8b0219ec239b5},
  intrahash = {ca35e602124130b480592b3a55267006},
  pages     = {207--216},
  publisher = {ACM Press},
  title     = {Mining association rules between sets of items in large databases},
  year      = 1993
}

@article{citeulike:2146554,
  abstract             = {Nowadays, multi-label classification methods are increasingly required by modern applications, such as protein function classification, music categorization and semantic scene classification. This paper introduces the task of multi-label classification, organizes the sparse related literature into a structured presentation and performs comparative experimental results of certain multi-label classification methods. It also contributes the definition of concepts for the quantification of the multi-label nature of a data set.},
  author               = {Tsoumakas, G. and Katakis, I.},
  citeulike-article-id = {2146554},
  editor               = {Taniar, David},
  interhash            = {f8e6c4b6b3df7461d070a1a9cc1d15c1},
  intrahash            = {52c3b18481f5146e4c213d609c1143fc},
  journal              = {International Journal of Data Warehouse and Mining},
  number               = 3,
  pages                = {1--13},
  posted-at            = {2007-12-19 13:38:29},
  priority             = {2},
  publisher            = {Idea Group Publishing},
  title                = {Multi Label Classification: An Overview},
  volume               = 3,
  year                 = 2007
}

@article{voelker2008aeon,
  abstract  = {OntoClean is an approach towards the formal evaluation of taxonomic relations in ontologies. The application of OntoClean consists of two main steps. First, concepts are tagged according to meta-properties known as rigidity, unity, dependency and identity. Second, the tagged concepts are checked according to predefined constraints to discover taxonomic errors. Although OntoClean is well documented in numerous publications, it is still used rather infrequently due to the high costs of application. Especially, the manual tagging of concepts with the correct meta-properties requires substantial efforts of highly experienced ontology engineers. In order to facilitate the use of OntoClean and to enable the evaluation of real-world ontologies, we provide AEON, a tool which automatically tags concepts with appropriate OntoClean meta-properties and performs the constraint checking. We use the Web as an embodiment of world knowledge, where we search for patterns that indicate how to properly tag concepts. We thoroughly evaluated our approach against a manually created gold standard. The evaluation shows the competitiveness of our approach while at the same time significantly lowering the costs. All of our results, i.e. the tool AEON as well as the experiment data, are publicly available.},
  address   = {Amsterdam, The Netherlands},
  author    = {Völker, Johanna and Vrandečić, Denny and Sure, York and Hotho, Andreas},
  interhash = {f14794f4961d0127dc50c1938eaef7ea},
  intrahash = {f8f0bb3e3495e7627770b470d1a5f1a3},
  issn      = {1570-5838},
  journal   = {Applied Ontology},
  number    = {1-2},
  pages     = {41--62},
  publisher = {IOS Press},
  title     = {{AEON} - An approach to the automatic evaluation of ontologies},
  url       = {http://portal.acm.org/citation.cfm?id=1412422},
  volume    = 3,
  year      = 2008
}

@inproceedings{1458098,
  address   = {New York, NY, USA},
  author    = {Song, Yang and Zhang, Lu and Giles, C. Lee},
  booktitle = {CIKM '08: Proceeding of the 17th ACM conference on Information and knowledge management},
  doi       = {10.1145/1458082.1458098},
  interhash = {5c03bc1e658b6d44f053944418bdaec3},
  intrahash = {d330a3537b4a14fbd40661424ec8e465},
  isbn      = {978-1-59593-991-3},
  location  = {Napa Valley, California, USA},
  pages     = {93--102},
  publisher = {ACM},
  title     = {A sparse {Gaussian} processes classification framework for fast tag suggestions},
  url       = {http://portal.acm.org/citation.cfm?id=1458098},
  year      = 2008
}

% "String Processing and Information Retrieval" is a proceedings title, so it is
% stored in booktitle of an inproceedings entry rather than in journal.
@inproceedings{keyhere,
  abstract  = {The identification of the user’s intention or interest through queries that they submit to a search engine can be very useful to offer them more adequate results. In this work we present a framework for the identification of user’s interest in an automatic way, based on the analysis of query logs. This identification is made from two perspectives, the objectives or goals of a user and the categories in which these aims are situated. A manual classification of the queries was made in order to have a reference point and then we applied supervised and unsupervised learning techniques. The results obtained show that for a considerable amount of cases supervised learning is a good option, however through unsupervised learning we found relationships between users and behaviors that are not easy to detect just taking the query words. Also, through unsupervised learning we established that there are categories that we are not able to determine in contrast with other classes that were not considered but naturally appear after the clustering process. This allowed us to establish that the combination of supervised and unsupervised learning is a good alternative to find user’s goals. From supervised learning we can identify the user interest given certain established goals and categories; on the other hand, with unsupervised learning we can validate the goals and categories used, refine them and select the most appropriate to the user’s needs.},
  author    = {Baeza-Yates, Ricardo and Calderón-Benavides, Liliana and González-Caro, Cristina},
  booktitle = {String Processing and Information Retrieval},
  doi       = {10.1007/11880561_9},
  interhash = {92e5f2f5208b5ce2f066dd361ae15758},
  intrahash = {27c7357d3337d890fef53168dce9ed33},
  pages     = {98--109},
  title     = {The Intention Behind {Web} Queries},
  url       = {http://dx.doi.org/10.1007/11880561_9},
  year      = 2006
}

@article{hotho2007mining,
  author    = {Hotho, Andreas and Stumme, Gerd},
  interhash = {26915a205b66368931821165ecaf972c},
  intrahash = {92d3a5fdd786086fa12787e3e350b6af},
  journal   = {Künstliche Intelligenz},
  number    = 3,
  pages     = {5--8},
  title     = {Mining the {World Wide Web}},
  url       = {http://kobra.bibliothek.uni-kassel.de/bitstream/urn:nbn:de:hebis:34-2008021320337/3/HothoStummeMiningWWW.pdf},
  vgwort    = {20},
  year      = 2007
}

@article{bloehdorn2006intro,
  author      = {Bloehdorn, S. and Buntine, W. and Hotho, A.},
  editor      = {Bloehdorn, S. and Buntine, W. and Hotho, A.},
  institution = {An International Journal of Computing and Informatics},
  interhash   = {4a3316f2f66fdfcb45c89bb872a82400},
  intrahash   = {e434232b8e3b80ff3b95006432fe54ee},
  issn        = {0350-5596},
  journal     = {Informatica},
  number      = 2,
  pages       = {141},
  title       = {Introduction to the Special Issue `Learning in {Web} Search'},
  url         = {http://www.informatica.si/PDF/30-2/00_Introduction.pdf},
  volume      = 30,
  year        = 2006
}

@proceedings{2005-lws-proceedings,
  editor    = {Bloehdorn, Stephan and Buntine, Wray and Hotho, Andreas},
  interhash = {940bd83a2753b14159690fde387e3909},
  intrahash = {2de98c2b635f36c137e25256e8c235e0},
  month     = aug,
  note      = {Workshop at the 22nd International Conference on Machine Learning (ICML 2005)},
  title     = {Proceedings of the Workshop on Learning in {Web} Search ({LWS} 2005)},
  url       = {http://cosco.hiit.fi/search/learninginsearch05/ICML_W4.pdf},
  year      = 2005
}

% NOTE(review): no publisher field -- add once confirmed.
@book{thrun2001,
  asin       = {0262201623},
  author     = {Thrun, Sebastian and Burgard, Wolfram and Fox, Dieter},
  interhash  = {f1b5ed6b916d753f4e24749056ab745e},
  intrahash  = {914a56f048c863f0928bb6d1efe09ff7},
  isbn       = {978-0-262-20162-9},
  title      = {Probabilistic Robotics (Intelligent Robotics and Autonomous Agents)},
  typesource = {Simple CitationSource},
  url        = {http://www.amazon.com/Probabilistic-Robotics-Intelligent-Autonomous-Agents/dp/0262201623/ref=sr_11_1/105-3361811-4085215?ie=UTF8&qid=1190743235&sr=11-1},
  year       = 2001
}

@book{buitelaar05ontologylearningbook,
  editor    = {Buitelaar, Paul and Cimiano, Philipp and Magnini, Bernardo},
  interhash = {9a5beec1eb7d58ead91f134915be86ab},
  intrahash = {0e71ddd52894af0e681b9d9411f7944f},
  month     = jul,
  publisher = {IOS Press},
  series    = {Frontiers in Artificial Intelligence},
  title     = {Ontology Learning from Text: Methods, Evaluation and Applications},
  volume    = 123,
  year      = 2005
}

% Describes the whole book (no chapter or pages given), hence a book entry.
@book{baldi03modelling,
  abstract             = {Modeling the Internet and the Web covers the most important aspects of modeling the Web using a modern mathematical and probabilistic treatment. It focuses on the information and application layers, as well as some of the emerging properties of the Internet. Provides a comprehensive introduction to the modeling of the Internet and the Web at the information level. Takes a modern approach based on mathematical, probabilistic, and graphical modeling. Provides an integrated presentation of theory, examples, exercises and applications. Covers key topics such as text analysis, link analysis, crawling techniques, human behaviour, and commerce on the Web. Interdisciplinary in nature, Modeling the Internet and the Web will be of interest to students and researchers from a variety of disciplines including computer science, machine learning, engineering, statistics, economics, business, and the social sciences.},
  author               = {Baldi, Pierre and Frasconi, Paolo and Smyth, Padhraic},
  citeulike-article-id = {822915},
  interhash            = {416f2405193ae7d30cffe673dee89df2},
  intrahash            = {3e4e2899e7d6988218d02a264bcfe24a},
  month                = apr,
  priority             = {2},
  publisher            = {Wiley},
  title                = {Modeling the {Internet} and the {Web}: Probabilistic Methods and Algorithms},
  url                  = {http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0470849061.html},
  year                 = 2003
}

% NOTE(review): the citation key says 2005 but year is 2006 -- confirm; the key
% is left unchanged so existing citations keep working.
@article{wikipediaxml:2005,
  author    = {Denoyer, Ludovic and Gallinari, Patrick},
  interhash = {0e9b9afb15804d3e625d73ada85900b1},
  intrahash = {493b849942fcaf9ba8e8e68e3cb46d38},
  journal   = {SIGIR Forum},
  title     = {The {Wikipedia} {XML} Corpus},
  url       = {http://www-connex.lip6.fr/~denoyer/wikipediaXML/},
  year      = 2006
}

@book{Qui93,
  address   = {California},
  author    = {Quinlan, J. R.},
  interhash = {1a265267f55efc59cd96ecb93a69b520},
  intrahash = {da2798a9bd21fd49a31dde24cb605b1a},
  isbn      = {1558602380},
  publisher = {Morgan Kaufmann},
  title     = {{C4.5} Programs for Machine Learning},
  year      = 1993
}

@article{Sebastiani02,
  author             = {Sebastiani, F.},
  bb-further-address = {--Dordrecht--London},
  interhash          = {d945d9218673dad37dc2a06cbf9e554c},
  intrahash          = {0fe0d5dd12c2cb59dfc330e684ec4b4a},
  journal            = {ACM Computing Surveys},
  number             = 1,
  pages              = {1--47},
  title              = {Machine learning in automated text categorization},
  url                = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf},
  volume             = 34,
  year               = 2002
}