% Bibliography database: references on crowdsourcing, web archiving,
% information visualisation, opinion mining and data mining.
% Text outside of entries (such as these notes) is ignored by BibTeX.
% Layout: one field per line; bibliographic fields first, then identifiers,
% then the abstract.

% Journal article: the CDAS crowdsourcing analytics system.
@article{liu2012crowdsourcing,
  author     = {Liu, Xuan and Lu, Meiyu and Ooi, Beng Chin and Shen, Yanyan and Wu, Sai and Zhang, Meihui},
  title      = {{CDAS}: A Crowdsourcing Data Analytics System},
  journal    = {Proceedings of the {VLDB} Endowment},
  volume     = {5},
  number     = {10},
  pages      = {1040--1051},
  month      = jun,
  year       = {2012},
  publisher  = {{VLDB} Endowment},
  issn       = {2150-8097},
  url        = {http://dl.acm.org/citation.cfm?id=2336664.2336676},
  acmid      = {2336676},
  issue_date = {June 2012},
  numpages   = {12},
  interhash  = {41ad6e73b03373d76d3164ba248335d7},
  intrahash  = {2091967734f96c4afbc09319d48a8c65},
  abstract   = {Some complex problems, such as image tagging and natural language processing, are very challenging for computers, where even state-of-the-art technology is yet able to provide satisfactory accuracy. Therefore, rather than relying solely on developing new and better algorithms to handle such tasks, we look to the crowdsourcing solution -- employing human participation -- to make good the shortfall in current technology. Crowdsourcing is a good supplement to many computer tasks. A complex job may be divided into computer-oriented tasks and human-oriented tasks, which are then assigned to machines and humans respectively.

    To leverage the power of crowdsourcing, we design and implement a Crowdsourcing Data Analytics System, CDAS. CDAS is a framework designed to support the deployment of various crowdsourcing applications. The core part of CDAS is a quality-sensitive answering model, which guides the crowdsourcing engine to process and monitor the human tasks. In this paper, we introduce the principles of our quality-sensitive model. To satisfy user required accuracy, the model guides the crowdsourcing query engine for the design and processing of the corresponding crowdsourcing jobs. It provides an estimated accuracy for each generated result based on the human workers' historical performances. When verifying the quality of the result, the model employs an online strategy to reduce waiting time. To show the effectiveness of the model, we implement and deploy two analytics jobs on CDAS, a twitter sentiment analytics job and an image tagging job. We use real Twitter and Flickr data as our queries respectively. We compare our approaches with state-of-the-art classification and image annotation techniques. The results show that the human-assisted methods can indeed achieve a much higher accuracy.
    By embedding the quality-sensitive model into crowdsourcing query engine, we effectively reduce the processing cost while maintaining the required query answer quality.},
}

% Journal article (German): web archiving and web archive mining.
% German nouns in the title are brace-protected so that sentence-casing
% styles do not lowercase them.
@article{rauber2009webarchivierung,
  author    = {Rauber, Andreas and Kaiser, Max},
  editor    = {Knoll, Matthias and Meier, Andreas},
  title     = {Webarchivierung und {Web Archive Mining}: Notwendigkeit, {Probleme} und {Lösungsansätze}},
  journal   = {{HMD} Praxis der Wirtschaftsinformatik},
  volume    = {268},
  month     = aug,
  year      = {2009},
  publisher = {dpunkt.verlag},
  issn      = {1436-3011},
  url       = {http://hmd.dpunkt.de/268/03.php},
  interhash = {3b35b676a2817868d93481aeebfa4154},
  intrahash = {cdaef18169a7d8300cf54daf018a74cc},
  abstract  = {In den letzten Jahren haben Bibliotheken und Archive zunehmend die Aufgabe übernommen, neben konventionellen Publikationen auch Inhalte aus dem World Wide Web zu sammeln, um so diesen wertvollen Teil unseres kulturellen Erbes zu bewahren und wichtige Informationen langfristig verfügbar zu halten. Diese massiven Datensammlungen bieten faszinierende Möglichkeiten, rasch Zugriff auf wichtige Informationen zu bekommen, die im Live-Web bereits verloren gegangen sind. Sie sind eine unentbehrliche Quelle für Wissenschaftler, die in der Zukunft die gesellschaftliche und technologische Entwicklung unserer Zeit nachvollziehen wollen. Auf der anderen Seite stellt eine derartige Datensammlung aber einen völlig neuen Datenbestand dar, der nicht nur rechtliche, sondern auch zahlreiche ethische Fragen betreffend seine Nutzung aufwirft. Diese werden in dem Ausmaß zunehmen, in dem die technischen Möglichkeiten zur automatischen Analyse und Interpretation dieser Daten leistungsfähiger werden.
    Da sich die meisten Webarchivierungsinitiativen dieser Problematik bewusst sind, bleibt die Nutzung der Daten derzeit meist stark eingeschränkt, oder es wird eine Art von "Opt-Out"-Möglichkeit vorgesehen, wodurch Webseiteninhaber die Aufnahme ihrer Seiten in ein Webarchiv ausschließen können. Mit beiden Ansätzen können Webarchive ihr volles Nutzungspotenzial nicht ausschöpfen. Dieser Artikel beschreibt einleitend kurz die Technologien, die zur Sammlung von Webinhalten zu Archivierungszwecken verwendet werden. Er hinterfragt Annahmen, die die freie Verfügbarkeit der Daten und unterschiedliche Nutzungsarten betreffen. Darauf aufbauend identifiziert er eine Reihe von offenen Fragen, deren Lösung einen breiteren Zugriff und bessere Nutzung von Webarchiven erlauben könnte.},
}

% Book. The url is the plain product page: percent-encoded tracking
% parameters do not belong in a bibliography and the percent signs are
% fragile in classic BibTeX output.
@book{tufte2001visual,
  author    = {Tufte, Edward R.},
  title     = {The Visual Display of Quantitative Information},
  edition   = {Second},
  year      = {2001},
  publisher = {Graphics Press},
  isbn      = {0961392142},
  url       = {http://www.amazon.com/Visual-Display-Quantitative-Information-2nd/dp/0961392142},
  asin      = {0961392142},
  ean       = {9780961392147},
  dewey     = {001.4226},
  interhash = {9900880e451150c1b06ede3c780c062b},
  intrahash = {9c028ebcb336380cb02e2a4beaa14d54},
}

% Journal article (survey): opinion mining and sentiment analysis.
@article{pang2008opinion,
  author    = {Pang, Bo and Lee, Lillian},
  title     = {Opinion Mining and Sentiment Analysis},
  journal   = {Foundations and Trends in Information Retrieval},
  volume    = {2},
  number    = {1--2},
  pages     = {1--135},
  month     = jan,
  year      = {2008},
  publisher = {Now Publishers Inc.},
  address   = {Hanover, MA, USA},
  issn      = {1554-0669},
  doi       = {10.1561/1500000011},
  url       = {http://portal.acm.org/citation.cfm?id=1454712},
  interhash = {7bfd8b20ea5f9fb76e96d71c3155c50c},
  intrahash = {4d0e1a6268b3d8a119aaf2b0c2cb5154},
  abstract  = {An important part of our information-gathering behavior has always been to find out what other people think. With the growing availability and popularity of opinion-rich resources such as online review sites and personal blogs, new opportunities and challenges arise as people now can, and do, actively use information technologies to seek out and understand the opinions of others. The sudden eruption of activity in the area of opinion mining and sentiment analysis, which deals with the computational treatment of opinion, sentiment, and subjectivity in text, has thus occurred at least in part as a direct response to the surge of interest in new systems that deal directly with opinions as a first-class object.},
}

% Chapter in an edited book: overview of data mining and KDD.
@incollection{fayyad1996data,
  author    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic},
  editor    = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic and Uthurusamy, Ramasamy},
  title     = {From Data Mining to Knowledge Discovery: An Overview},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  pages     = {1--34},
  year      = {1996},
  publisher = {American Association for Artificial Intelligence},
  address   = {Menlo Park, CA, USA},
  isbn      = {0-262-56097-6},
  url       = {http://portal.acm.org/citation.cfm?id=257942},
  interhash = {79663e4b1f464b82ce1ae45345dc424f},
  intrahash = {3f5a400d01a974f993cee1ac5f79cfc8},
  abstract  = {Data mining and knowledge discovery in databases have been attracting a significant amount of research, industry, and media attention of late. What is all the excitement about? This article provides an overview of this emerging field, clarifying how data mining and knowledge discovery in databases are related both to each other and to related fields, such as machine learning, statistics, and databases. The article mentions particular real-world applications, specific data-mining techniques, challenges involved in real-world applications of knowledge discovery, and current and future research directions in the field.},
}

% Journal article: Web usage mining for search engines.
@article{zhang2002web,
  author    = {Zhang, Dell and Dong, Yisheng},
  editor    = {Akyildiz, Ian F. and Rudin, Harry},
  title     = {A Novel {Web} Usage Mining Approach for Search Engines},
  journal   = {Computer Networks},
  volume    = {39},
  number    = {3},
  pages     = {303--310},
  month     = jun,
  year      = {2002},
  publisher = {Elsevier},
  issn      = {1389-1286},
  doi       = {10.1016/S1389-1286(02)00211-6},
  url       = {http://www.sciencedirect.com/science/article/B6VRG-45H0GV7-5/2/16726cebdcde67ba7aeb95cc91e797bf},
  interhash = {a38a0c8a8ffc3f7afd62215a57989d63},
  intrahash = {3a5f9c847318543dbf32b434656d8065},
  abstract  = {Web usage mining can be very useful to search engines. This paper proposes a novel effective approach to exploit the relationships among users, queries and resources based on the search engine's log. How this method can be applied is illustrated by a Chinese image search engine.},
}

% Journal article (survey): educational data mining.
% The doi field holds the bare DOI, without a resolver prefix.
@article{romero07,
  author    = {Romero, C. and Ventura, S.},
  title     = {Educational Data Mining: A Survey from 1995 to 2005},
  journal   = {Expert Systems with Applications},
  volume    = {33},
  number    = {1},
  pages     = {135--146},
  year      = {2007},
  publisher = {Pergamon Press, Inc.},
  address   = {Tarrytown, NY, USA},
  issn      = {0957-4174},
  doi       = {10.1016/j.eswa.2006.04.005},
  url       = {http://portal.acm.org/citation.cfm?id=1223659},
  interhash = {89d843f1a3b181f2a628e881d9210b22},
  intrahash = {746d12e92e58587461ffcb8dc381e283},
  abstract  = {Currently there is an increasing interest in data mining and educational systems, making educational data mining as a new growing research community. This paper surveys the application of data mining to traditional educational systems, particular web-based courses, well-known learning content management systems, and adaptive and intelligent web-based educational systems. Each of these systems has different data source and objectives for knowledge discovering. After preprocessing the available data in each case, data mining techniques can be applied: statistics and visualization; clustering, classification and outlier detection; association rule mining and pattern mining; and text mining. The success of the plentiful work needs much more specialized work in order for educational data mining to become a mature area.},
}

% Conference paper: the ParDCI parallel frequent-set counting algorithm.
% The doi field repeats the DOI already contained in the url field.
@inproceedings{orlando02efficient,
  author    = {Orlando, Salvatore and Palmerini, Paolo and Perego, Raffaele and Silvestri, Fabrizio},
  title     = {An Efficient Parallel and Distributed Algorithm for Counting Frequent Sets},
  booktitle = {High Performance Computing for Computational Science --- {VECPAR} 2002},
  pages     = {3--29},
  year      = {2003},
  doi       = {10.1007/3-540-36569-9_28},
  url       = {http://dx.doi.org/10.1007/3-540-36569-9_28},
  interhash = {50c17d100341c01892f7dd8fbd7deb69},
  intrahash = {522c68b8bb5e28f1bf9f1e11e612f542},
  abstract  = {Due to the huge increase in the number and dimension of available databases, efficient solutions for counting frequent sets are nowadays very important within the Data Mining community. Several sequential and parallel algorithms were proposed, which in many cases exhibit excellent scalability. In this paper we present ParDCI, a distributed and multithreaded algorithm for counting the occurrences of frequent sets within transactional databases. ParDCI is a parallel version of DCI (Direct Count \& Intersect), a multi-strategy algorithm which is able to adapt its behavior not only to the features of the specific computing platform (e.g. available memory), but also to the features of the dataset being processed (e.g. sparse or dense datasets). ParDCI enhances previous proposals by exploiting the highly optimized counting and intersection techniques of DCI, and by relying on a multi-level parallelization approach which explicitly targets clusters of SMPs, an emerging computing platform. We focused our work on the efficient exploitation of the underlying architecture. Intra-Node multithreading effectively exploits the memory hierarchies of each SMP node, while Inter-Node parallelism exploits smart partitioning techniques aimed at reducing communication overheads. In depth experimental evaluations demonstrate that ParDCI reaches nearly optimal performances under a variety of conditions.},
}

% Journal article: mining frequent closed itemsets.
@article{tkde06,
  author    = {Lucchese, Claudio and Orlando, Salvatore and Perego, Raffaele},
  title     = {Fast and Memory Efficient Mining of Frequent Closed Itemsets},
  journal   = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume    = {18},
  number    = {1},
  pages     = {21--36},
  year      = {2006},
  interhash = {33820f389bc1f6bdb96f5a8f925df879},
  intrahash = {3aff1098bf9828a0c6683f07145d60bb},
}

% Journal article (survey): top 10 data mining algorithms.
@article{wu2008wu,
  author    = {Wu, Xindong and Kumar, Vipin and Quinlan, J. Ross and Ghosh, Joydeep and Yang, Qiang and Motoda, Hiroshi and McLachlan, Geoffrey and Ng, Angus and Liu, Bing and Yu, Philip and Zhou, Zhi-Hua and Steinbach, Michael and Hand, David and Steinberg, Dan},
  title     = {Top 10 Algorithms in Data Mining},
  journal   = {Knowledge and Information Systems},
  volume    = {14},
  number    = {1},
  pages     = {1--37},
  month     = jan,
  year      = {2008},
  publisher = {Springer},
  address   = {London},
  issn      = {0219-1377},
  url       = {http://dx.doi.org/10.1007/s10115-007-0114-2},
  interhash = {76fd294a34cf85638f6e194a85af8db9},
  intrahash = {2c34bb4b49187a6d3e780e78d254ae1f},
  abstract  = {This paper presents the top 10 data mining algorithms identified by the IEEE International Conference on Data Mining (ICDM) in December 2006: C4.5, k-Means, SVM, Apriori, EM, PageRank, AdaBoost, kNN, Naive Bayes, and CART. These top 10 algorithms are among the most influential data mining algorithms in the research community. With each algorithm, we provide a description of the algorithm, discuss the impact of the algorithm, and review current and further research on the algorithm. These 10 algorithms cover classification, clustering, statistical learning, association analysis, and link mining, which are all among the most important topics in data mining research and development.},
}