@inproceedings{morstatter2013sample,
  author = {Morstatter, Fred and Pfeffer, J{\"u}rgen and Liu, Huan and Carley, Kathleen M.},
  booktitle = {Proceedings of the Seventh International AAAI Conference on Weblogs and Social Media (ICWSM)},
  interhash = {bca742d25a5f5fa43c8f106460449b5b},
  intrahash = {58707a28cc5098b9b3444501d5ca9a88},
  title = {Is the Sample Good Enough? Comparing Data from Twitter's Streaming API with Twitter's Firehose},
  year = 2013
}

@inproceedings{coates2011analysis,
  abstract = {A great deal of research has focused on algorithms for learning features from unlabeled data. Indeed, much progress has been made on benchmark datasets like NORB and CIFAR-10 by employing increasingly complex unsupervised learning algorithms and deep models. In this paper, however, we show that several simple factors, such as the number of hidden nodes in the model, may be more important to achieving high performance than the learning algorithm or the depth of the model. Specifically, we will apply several off-the-shelf feature learning algorithms (sparse auto-encoders, sparse RBMs, K-means clustering, and Gaussian mixtures) to CIFAR-10, NORB, and STL datasets using only single-layer networks. We then present a detailed analysis of the effect of changes in the model setup: the receptive field size, number of hidden nodes (features), the step-size ("stride") between extracted features, and the effect of whitening. Our results show that large numbers of hidden nodes and dense feature extraction are critical to achieving high performance - so critical, in fact, that when these parameters are pushed to their limits, we achieve state-of-the-art performance on both CIFAR-10 and NORB using only a single layer of features. More surprisingly, our best performance is based on K-means clustering, which is extremely fast, has no hyper-parameters to tune beyond the model structure itself, and is very easy to implement. Despite the simplicity of our system, we achieve accuracy beyond all previously published results on the CIFAR-10 and NORB datasets (79.6% and 97.2% respectively).},
  author = {Coates, A. and Lee, H. and Ng, A.Y.},
  booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics},
  editor = {Gordon, Geoffrey and Dunson, David and Dudík, Miroslav},
  interhash = {46cfb4b5b1c16c79a966512e07f67158},
  intrahash = {bcb2c1fd335ae57362cdf348ff727589},
  pages = {215--223},
  publisher = {JMLR W\&CP},
  series = {JMLR Workshop and Conference Proceedings},
  title = {An analysis of single-layer networks in unsupervised feature learning},
  url = {http://jmlr.csail.mit.edu/proceedings/papers/v15/coates11a.html},
  volume = 15,
  year = 2011
}

@inproceedings{coates2011detection,
  abstract = {Reading text from photographs is a challenging problem that has received a significant amount of attention. Two key components of most systems are (i) text detection from images and (ii) character recognition, and many recent methods have been proposed to design better feature representations and models for both. In this paper, we apply methods recently developed in machine learning -- specifically, large-scale algorithms for learning the features automatically from unlabeled data -- and show that they allow us to construct highly effective classifiers for both detection and recognition to be used in a high accuracy end-to-end system.},
  author = {Coates, A. and Carpenter, B. and Case, C. and Satheesh, S. and Suresh, B. and Wang, Tao and Wu, D.J. and Ng, A.Y.},
  booktitle = {International Conference on Document Analysis and Recognition (ICDAR)},
  doi = {10.1109/ICDAR.2011.95},
  interhash = {adb17817e5f95605a8066737ce0e8b7e},
  intrahash = {b550ca5ec5a8b61b64b17091f7b2eeab},
  issn = {1520-5363},
  month = sep,
  pages = {440--445},
  title = {Text Detection and Character Recognition in Scene Images with Unsupervised Feature Learning},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6065350&tag=1},
  year = 2011
}

@incollection{leake2000casebased,
  abstract = {Case-based reasoning (CBR) is an artificial intelligence paradigm for reasoning and learning. Case-based reasoning solves new problems by retrieving stored records of prior problem-solving episodes (cases) and adapting their solutions to fit new circumstances. Each processing episode provides a new case that is stored for future reuse, making learning a natural side-effect of the reasoning process. Case-based reasoning is also studied within cognitive science as a model of human reasoning: studies show that people use recollections of prior problems to guide their reasoning in a wide range of tasks, such as programming, mathematical problem solving, diagnosis, decision making, and design.},
  acmid = {1074199},
  address = {Chichester, UK},
  author = {Leake, David B.},
  booktitle = {Encyclopedia of Computer Science},
  edition = {4th},
  editor = {Ralston, Anthony and Reilly, Edwin D. and Hemmendinger, David},
  interhash = {fa414e2f48be14bb94cbfbf2566e36af},
  intrahash = {b8526b7c03f1fc9bdd85863dfbf881a2},
  isbn = {0-470-86412-5},
  month = jun,
  numpages = {2},
  pages = {196--197},
  publisher = {John Wiley and Sons Ltd.},
  title = {Case-based reasoning},
  url = {http://dl.acm.org/citation.cfm?id=1074100.1074199},
  year = 2000
}

@article{raykar2010learning,
  abstract = {For many supervised learning tasks it may be infeasible (or very expensive) to obtain objective and reliable labels. Instead, we can collect subjective (possibly noisy) labels from multiple experts or annotators. In practice, there is a substantial amount of disagreement among the annotators, and hence it is of great practical interest to address conventional supervised learning problems in this scenario. In this paper we describe a probabilistic approach for supervised learning when we have multiple annotators providing (possibly noisy) labels but no absolute gold standard. The proposed algorithm evaluates the different experts and also gives an estimate of the actual hidden labels. Experimental results indicate that the proposed method is superior to the commonly used majority voting baseline.},
  acmid = {1859894},
  author = {Raykar, Vikas C. and Yu, Shipeng and Zhao, Linda H. and Valadez, Gerardo Hermosillo and Florin, Charles and Bogoni, Luca and Moy, Linda},
  interhash = {8113daf47997fddf48e4c6c79f2eba56},
  intrahash = {14220abe8babfab01c0cdd5ebd5e4b7c},
  issn = {1532-4435},
  issue_date = {3/1/2010},
  journal = {Journal of Machine Learning Research},
  month = aug,
  numpages = {26},
  pages = {1297--1322},
  publisher = {JMLR.org},
  title = {Learning From Crowds},
  url = {http://dl.acm.org/citation.cfm?id=1756006.1859894},
  volume = 11,
  year = 2010
}

@inproceedings{bullock2011tagging,
  author = {Bullock, Beate Navarro and Jäschke, Robert and Hotho, Andreas},
  booktitle = {Proceedings of the ACM WebSci'11},
  interhash = {7afaa67dfeb07f7e0b85abf2be61aff1},
  intrahash = {493e03868a98f498628cad31f9320e9f},
  month = jun,
  title = {Tagging data as implicit feedback for learning-to-rank},
  url = {http://journal.webscience.org/463/},
  year = 2011
}

@article{themenheft2007webmining,
  author = {Hotho, Andreas and Stumme, Gerd},
  interhash = {39f94bf3a1663d9cec6a6cb8354a9bd9},
  intrahash = {e9535ec82afa53f44a1b37704aa9a71f},
  journal = {Künstliche Intelligenz},
  number = 3,
  pages = {5--8},
  title = {Mining the World Wide Web -- Methods, Applications, and Perspectives},
  url = {http://www.kuenstliche-intelligenz.de/index.php?id=7758},
  year = 2007
}

@proceedings{themenheft2007webmining-proc,
  editor = {Hotho, Andreas and Stumme, Gerd},
  interhash = {83c28b86f2ac897e906660e54e6fffc0},
  intrahash = {c73311bb72ad480d74125dbc9d94c450},
  journal = {Künstliche Intelligenz},
  number = 3,
  pages = {5--8},
  title = {Themenheft Web Mining, Künstliche Intelligenz},
  url = {http://www.kuenstliche-intelligenz.de/index.php?id=7758},
  year = 2007
}

@misc{Sarma2011,
  abstract = {De-duplication---identification of distinct records referring to the same real-world entity---is a well-known challenge in data integration. Since very large datasets prohibit the comparison of every pair of records, {\em blocking} has been identified as a technique of dividing the dataset for pairwise comparisons, thereby trading off {\em recall} of identified duplicates for {\em efficiency}. Traditional de-duplication tasks, while challenging, typically involved a fixed schema such as Census data or medical records. However, with the presence of large, diverse sets of structured data on the web and the need to organize it effectively on content portals, de-duplication systems need to scale in a new dimension to handle a large number of schemas, tasks and data sets, while handling ever larger problem sizes. In addition, when working in a map-reduce framework it is important that canopy formation be implemented as a {\em hash function}, making the canopy design problem more challenging. We present CBLOCK, a system that addresses these challenges. CBLOCK learns hash functions automatically from attribute domains and a labeled dataset consisting of duplicates. Subsequently, CBLOCK expresses blocking functions using a hierarchical tree structure composed of atomic hash functions. The application may guide the automated blocking process based on architectural constraints, such as by specifying a maximum size of each block (based on memory requirements), impose disjointness of blocks (in a grid environment), or specify a particular objective function trading off recall for efficiency. As a post-processing step to automatically generated blocks, CBLOCK {\em rolls-up} smaller blocks to increase recall.
We present experimental results on two large-scale de-duplication datasets at Yahoo!---consisting of over 140K movies and 40K restaurants respectively---and demonstrate the utility of CBLOCK.},
  author = {Sarma, Anish Das and Jain, Ankur and Machanavajjhala, Ashwin and Bohannon, Philip},
  interhash = {3f32848ef4bb26a3057c3feadff99c5a},
  intrahash = {389dba4432b1340211ef6be8e3d45a1d},
  note = {cite arxiv:1111.3689},
  title = {CBLOCK: An Automatic Blocking Mechanism for Large-Scale De-duplication Tasks},
  url = {http://arxiv.org/abs/1111.3689},
  year = 2011
}

@article{griffiths2004finding,
  author = {Griffiths, Thomas L. and Steyvers, Mark},
  interhash = {387a5060792d52ea73b02dd68e52559e},
  intrahash = {cbfda2e50bd63357890b9181d8883826},
  journal = {Proceedings of the National Academy of Sciences},
  number = {suppl. 1},
  pages = {5228--5235},
  title = {Finding scientific topics},
  url = {http://www.pnas.org/cgi/content/abstract/101/suppl_1/5228},
  volume = 101,
  year = 2004
}

@inproceedings{conf/semweb/TangHLL06,
  author = {Tang, Jie and Hong, MingCai and Li, Juan-Zi and Liang, Bangyong},
  booktitle = {International Semantic Web Conference},
  crossref = {conf/semweb/2006},
  date = {2006-11-09},
  editor = {Cruz, Isabel F. and Decker, Stefan and Allemang, Dean and Preist, Chris and Schwabe, Daniel and Mika, Peter and Uschold, Michael and Aroyo, Lora},
  ee = {http://dx.doi.org/10.1007/11926078_46},
  interhash = {0cd79ca123126fe66d0e2f2888222c79},
  intrahash = {e378a25116a480b55e64a919a351f1a7},
  isbn = {3-540-49029-9},
  pages = {640--653},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Tree-Structured Conditional Random Fields for Semantic Annotation},
  url = {http://dblp.uni-trier.de/db/conf/semweb/iswc2006.html#TangHLL06},
  volume = 4273,
  year = 2006
}

@incollection{bishop1998latent,
  author = {Bishop, C.M.},
  booktitle = {Learning in Graphical Models},
  interhash = {3d556e46becabe2ec132942af1409079},
  intrahash = {8c95ee19fc4410c654b624cb9bd6ff57},
  publisher = {MIT Press},
  title = {Latent variable models},
  year = 1998
}

@book{jordan-learning-98,
  editor = {Jordan, M.},
  interhash = {dca14c475ead34e75711dfe8bb911d96},
  intrahash = {101d8938173add30b69dd1f4872e6eb7},
  publisher = {MIT Press},
  title = {Learning in Graphical Models},
  year = 1998
}

@incollection{cowell1998advanced,
  author = {Cowell, R.},
  booktitle = {Learning in Graphical Models},
  interhash = {fe438e1412e694bba0969bc7f99310a6},
  intrahash = {aa27d5a4998c8c6967049cb99c5bd40e},
  publisher = {MIT Press},
  title = {Advanced inference in Bayesian networks},
  year = 1998
}

@book{gilks1996markov,
  editor = {Gilks, W.R. and Richardson, S. and Spiegelhalter, D.J.},
  interhash = {152f39e8e21e5da1545e74b32b6c4e76},
  intrahash = {5c410a2edd204cd117776f6d7f2fea5f},
  publisher = {Chapman \& Hall/CRC},
  title = {Markov Chain Monte Carlo in Practice},
  year = 1996
}

@article{Tierney94,
  author = {Tierney, L.},
  interhash = {61cadfbd70e58e20f9b218a6b6c747e7},
  intrahash = {b1bdabb2b26068df271283a8d6c37419},
  journal = {The Annals of Statistics},
  number = 4,
  pages = {1701--1727},
  title = {Markov chains for exploring posterior distributions},
  volume = 22,
  year = 1994
}

@article{casella1992,
  abstract = {Computer-intensive algorithms, such as the Gibbs sampler, have become increasingly popular statistical tools, both in applied and theoretical work.
The properties of such algorithms, however, may sometimes not be obvious. Here we give a simple explanation of how and why the Gibbs sampler works. We analytically establish its properties in a simple case and provide insight for more complicated cases. There are also a number of examples.},
  author = {Casella, George and George, Edward I.},
  doi = {10.2307/2685208},
  interhash = {ba4f08a9e4e1add859c3b2c9661728fa},
  intrahash = {d9ef3231e2903c2f5bc2ef565f87f882},
  issn = {00031305},
  journal = {The American Statistician},
  number = 3,
  pages = {167--174},
  publisher = {American Statistical Association},
  title = {Explaining the Gibbs Sampler},
  url = {http://dx.doi.org/10.2307/2685208},
  volume = 46,
  year = 1992
}

@article{Buntine94operationsfor,
  abstract = {This paper is a multidisciplinary review of empirical, statistical learning from a graphical model perspective. Well-known examples of graphical models include Bayesian networks, directed graphs representing a Markov chain, and undirected networks representing a Markov field. These graphical models are extended to model data analysis and empirical learning using the notation of plates. Graphical operations for simplifying and manipulating a problem are provided including decomposition, differentiation, and the manipulation of probability models from the exponential family. Two standard algorithm schemas for learning are reviewed in a graphical framework: Gibbs sampling and the expectation maximization algorithm. Using these operations and schemas, some popular algorithms can be synthesized from their graphical specification. This includes versions of linear regression, techniques for feed-forward networks, and learning Gaussian and discrete Bayesian networks from data. The paper conclu...},
  author = {Buntine, Wray L.},
  interhash = {c7dd650780467c934551356630a7b739},
  intrahash = {8952cf0d215116e038971f7c30d6d19d},
  journal = {Journal of Artificial Intelligence Research},
  pages = {159--225},
  title = {Operations for Learning with Graphical Models},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.52.696},
  volume = 2,
  year = 1994
}