@inproceedings{conf/wsdm/KohlschutterFN10,
  author = {Kohlschütter, Christian and Fankhauser, Peter and Nejdl, Wolfgang},
  booktitle = {Proceedings of the Third {ACM} International Conference on Web Search and Data Mining ({WSDM} 2010), New York City, NY, USA},
  interhash = {25ea118166ef2f0d5597ca90fa702c9d},
  intrahash = {dbc8464d9a298afa49d607d65f2160e2},
  title = {Boilerplate Detection using Shallow Text Features},
  year = 2010
}

@inproceedings{conf/www/SinhaSSMEHW15,
  author = {Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June Paul and Wang, Kuansan},
  booktitle = {WWW (Companion Volume)},
  crossref = {conf/www/2015c},
  doi = {10.1145/2740908.2742839},
  editor = {Gangemi, Aldo and Leonardi, Stefano and Panconesi, Alessandro},
  ee = {http://doi.acm.org/10.1145/2740908.2742839},
  interhash = {6d71a6eb1d070023f6fb75a5f1019a21},
  intrahash = {e6066395c31b2f3de9fb836dbac5723a},
  isbn = {978-1-4503-3473-0},
  pages = {243--246},
  publisher = {ACM},
  title = {An Overview of {Microsoft} {Academic} {Service} ({MAS}) and Applications},
  url = {http://dblp.uni-trier.de/db/conf/www/www2015c.html#SinhaSSMEHW15},
  year = 2015
}

@article{mnih2015humanlevel,
  author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis},
  doi = {10.1038/nature14236},
  interhash = {eac59980357d99db87b341b61ef6645f},
  intrahash = {fb15f4471c81dc2b9edf2304cb2f7083},
  issn = {0028-0836},
  journal = {Nature},
  month = feb,
  number = 7540,
  pages = {529--533},
  publisher = {Nature Publishing Group},
  title = {Human-level control through deep reinforcement learning},
  url = {http://dx.doi.org/10.1038/nature14236},
  volume = 518,
  year = 2015
}

@inproceedings{tran2015semantic,
  abstract = {In this paper we study the problem of semantic annotation for a trending hashtag which is the crucial step towards analyzing user behavior in social media, yet has been largely unexplored. We tackle the problem via linking to entities from Wikipedia. We incorporate the social aspects of trending hashtags by identifying prominent entities for the annotation so as to maximize the information spreading in entity networks. We exploit temporal dynamics of entities in Wikipedia, namely Wikipedia edits and page views to improve the annotation quality. Our experiments show that we significantly outperform the established methods in tweet annotation.},
  author = {Tran, Tuan and Tran, Nam-Khanh and Teka Hadgu, Asmelash and Jäschke, Robert},
  booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  interhash = {4156275c801376fa64dfdb69a4ce60c4},
  intrahash = {9d4cd9070922e1eb43bcab1da4a9d840},
  month = sep,
  publisher = {Association for Computational Linguistics},
  title = {Semantic Annotation for Microblog Topics Using {Wikipedia} Temporal Information},
  year = 2015
}

@inproceedings{DBLP:conf/dsaa/KrompassNT14,
  author = {Krompass, Denis and Nickel, Maximilian and Tresp, Volker},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  booktitle = {International Conference on Data Science and Advanced Analytics, {DSAA} 2014, Shanghai, China, October 30 -- November 1, 2014},
  crossref = {DBLP:conf/dsaa/2014},
  doi = {10.1109/DSAA.2014.7058046},
  interhash = {0ca986606c22ca0b3780c9b9c25f31c7},
  intrahash = {c952ed96ece470e4fa5336eedf670d5b},
  isbn = {978-1-4799-6991-3},
  pages = {18--24},
  publisher = {{IEEE}},
  title = {Large-scale factorization of type-constrained multi-relational data},
  url = {http://dx.doi.org/10.1109/DSAA.2014.7058046},
  year = 2014
}

@inproceedings{conf/icdm/DuBJ10,
  author = {Du, Lan and Buntine, Wray Lindsay and Jin, Huidong},
  booktitle = {ICDM},
  crossref = {conf/icdm/2010},
  doi = {10.1109/ICDM.2010.51},
  editor = {Webb, Geoffrey I. and Liu, Bing and Zhang, Chengqi and Gunopulos, Dimitrios and Wu, Xindong},
  ee = {http://doi.ieeecomputersociety.org/10.1109/ICDM.2010.51},
  interhash = {dcde7dbdd419330aabb01d151e23c45c},
  intrahash = {5a639efaf1e8fea6b0f309333efd7bee},
  isbn = {978-0-7695-4256-0},
  pages = {148--157},
  publisher = {IEEE Computer Society},
  title = {Sequential Latent {Dirichlet} Allocation: Discover Underlying Topic Structures within a Document},
  url = {http://dblp.uni-trier.de/db/conf/icdm/icdm2010.html#DuBJ10},
  year = 2010
}

@inproceedings{conf/conll/LevyG14,
  author = {Levy, Omer and Goldberg, Yoav},
  booktitle = {CoNLL},
  crossref = {conf/conll/2014},
  editor = {Morante, Roser and Yih, Wen-tau},
  ee = {http://aclweb.org/anthology/W/W14/W14-1618.pdf},
  interhash = {680dde1fd83a8dd0d6b2619a8266516e},
  intrahash = {23bb00b6abab97ed93e74f3b5b148630},
  isbn = {978-1-941643-02-0},
  pages = {171--180},
  publisher = {ACL},
  title = {Linguistic Regularities in Sparse and Explicit Word Representations},
  url = {http://dblp.uni-trier.de/db/conf/conll/conll2014.html#LevyG14},
  year = 2014
}

% NOTE(review): the previous pages value (mps028) was the advance-access article ID, not a page range.
% Volume, number, pages and DOI below are believed to match the published record -- please confirm against the journal site.
@article{grimmer2013text,
  author = {Grimmer, Justin and Stewart, Brandon M.},
  doi = {10.1093/pan/mps028},
  interhash = {eb68e01ef4168a398d79f408042fe529},
  intrahash = {76001ebc726700bef81886d2e285b7cf},
  journal = {Political Analysis},
  number = 3,
  pages = {267--297},
  title = {Text as data: The promise and pitfalls of automatic content analysis methods for political texts},
  volume = 21,
  year = 2013
}

% The venue was previously stored (in scrambled word order) in the editor field; it belongs in booktitle.
@inproceedings{noauthororeditor,
  author = {Mirowski, Piotr and Ranzato, Marc'Aurelio and LeCun, Yann},
  booktitle = {Proceedings of the {NIPS} 2010 Workshop on Deep Learning},
  interhash = {b7ce347e904a4ca3263cf6cc1e2253bd},
  intrahash = {fc3e0e3af595f9a46df6bc9233df836f},
  title = {Dynamic Auto-Encoders for Semantic Indexing},
  url = {http://yann.lecun.com/exdb/publis/pdf/mirowski-nipsdl-10.pdf},
  year = 2010
}

@article{SSQU:SSQU478,
  abstract = {Objective. This study is an effort to produce a more systematic, empirically-based, historical-comparative understanding of media bias than generally is found in previous works. Methods. The research employs a quantitative measure of ideological bias in a formal content analysis of the United States' two largest circulation news magazines, Time and Newsweek. Findings are compared with the results of an identical examination of two of the nation's leading partisan journals, the conservative National Review and the liberal Progressive. Results. Bias scores reveal stark differences between the mainstream and the partisan news magazines' coverage of four issue areas: crime, the environment, gender, and poverty. Conclusion. Data provide little support for those claiming significant media bias in either ideological direction.},
  author = {Covert, Tawnya J. Adkins and Wasburn, Philo C.},
  doi = {10.1111/j.1540-6237.2007.00478.x},
  interhash = {9276222b3b8684048db1e42c3a9f3409},
  intrahash = {81474f00e1605d45462e23f743dc88bb},
  issn = {1540-6237},
  journal = {Social Science Quarterly},
  number = 3,
  pages = {690--706},
  publisher = {Blackwell Publishing Inc},
  title = {Measuring Media Bias: A Content Analysis of {Time} and {Newsweek} Coverage of Domestic Social Issues, 1975--2000},
  url = {http://dx.doi.org/10.1111/j.1540-6237.2007.00478.x},
  volume = 88,
  year = 2007
}

@article{noKey,
  abstract = {The extensive literature documenting the ecological effects of roads has repeatedly implicated noise as one of the causal factors. Recent studies of wildlife responses to noise have decisively identified changes in animal behaviors and spatial distributions that are caused by noise. Collectively, this research suggests that spatial extent and intensity of potential noise impacts to wildlife can be studied by mapping noise sources and modeling the propagation of noise across landscapes. Here we present models of energy extraction, aircraft overflight and roadway noise as examples of spatially extensive sources and to present tools available for landscape scale investigations. We focus these efforts in US National Parks (Mesa Verde, Grand Teton and Glacier) to highlight that ecological noise pollution is not a threat restricted to developed areas and that many protected natural areas experience significant noise loads. As a heuristic tool for understanding past and future noise pollution we forecast community noise utilizing a spatially-explicit land-use change model that depicts the intensity of human development at sub-county resolution. For road noise, we transform effect distances from two studies into sound levels to begin a discussion of noise thresholds for wildlife. The spatial scale of noise exposure is far larger than any protected area, and no site in the continental US is free from noise. The design of observational and experimental studies of noise effects should be informed by knowledge of regional noise exposure patterns.},
  author = {Barber, Jesse R. and Burdett, Chris L. and Reed, Sarah E. and Warner, Katy A. and Formichella, Charlotte and Crooks, Kevin R. and Theobald, Dave M. and Fristrup, Kurt M.},
  doi = {10.1007/s10980-011-9646-7},
  interhash = {ebd2433210dffb7fecae1dcf14b4fa6b},
  intrahash = {17c859ff5dba77ef46cb7677f5221519},
  issn = {0921-2973},
  journal = {Landscape Ecology},
  language = {English},
  number = 9,
  pages = {1281--1295},
  publisher = {Springer Netherlands},
  title = {Anthropogenic noise exposure in protected natural areas: estimating the scale of ecological consequences},
  url = {http://dx.doi.org/10.1007/s10980-011-9646-7},
  volume = 26,
  year = 2011
}

@inproceedings{Kumar:2015:IS:2684822.2685310,
  abstract = {We consider the problem of inferring choices made by users based only on aggregate data containing the relative popularity of each item. We propose a framework that models the problem as that of inferring a Markov chain given a stationary distribution. Formally, we are given a graph and a target steady-state distribution on its nodes. We are also given a mapping from per-node scores to a transition matrix, from a broad family of such mappings. The goal is to set the scores of each node such that the resulting transition matrix induces the desired steady state. We prove sufficient conditions under which this problem is feasible and, for the feasible instances, obtain a simple algorithm for a generic version of the problem. This iterative algorithm provably finds the unique solution to this problem and has a polynomial rate of convergence; in practice we find that the algorithm converges after fewer than ten iterations. We then apply this framework to choice problems in online settings and show that our algorithm is able to explain the observed data and predict the user choices much better than other competing baselines across a variety of diverse datasets.},
  acmid = {2685310},
  address = {New York, NY, USA},
  author = {Kumar, Ravi and Tomkins, Andrew and Vassilvitskii, Sergei and Vee, Erik},
  booktitle = {Proceedings of the Eighth {ACM} International Conference on Web Search and Data Mining},
  doi = {10.1145/2684822.2685310},
  interhash = {15326871c92155e46259db7cb455d584},
  intrahash = {e0e10a01d0f65da00f5390482407abd2},
  isbn = {978-1-4503-3317-7},
  location = {Shanghai, China},
  numpages = {10},
  pages = {359--368},
  publisher = {ACM},
  series = {WSDM '15},
  title = {Inverting a Steady-State},
  url = {http://doi.acm.org/10.1145/2684822.2685310},
  year = 2015
}

% NOTE(review): third author corrected from Hruscha to Hruschka; the old note field only repeated title and venue and was dropped.
% The url points at a publication list, not the paper itself -- consider replacing it with a direct link.
@inproceedings{mitchell2015,
  author = {Mitchell, T. and Cohen, W. and Hruschka, E. and Talukdar, P. and Betteridge, J. and Carlson, A. and Dalvi, B. and Gardner, M. and Kisiel, B. and Krishnamurthy, J. and Lao, N. and Mazaitis, K. and Mohammad, T. and Nakashole, N. and Platanios, E. and Ritter, A. and Samadi, M. and Settles, B. and Wang, R. and Wijaya, D. and Gupta, A. and Chen, X. and Saparov, A. and Greaves, M. and Welling, J.},
  booktitle = {Proceedings of the Twenty-Ninth {AAAI} Conference on Artificial Intelligence ({AAAI}-15)},
  interhash = {52d0d71f6f5b332dabc1412f18e3a93d},
  intrahash = {63070703e6bb812852cca56574aed093},
  title = {Never-Ending Learning},
  url = {http://www.cs.cmu.edu/~wcohen/pubs.html},
  year = 2015
}

% Conference paper: the entry type was changed from article to inproceedings and the
% non-standard conference field was moved into booktitle so that styles can render the venue.
@inproceedings{kataria2011context,
  abstract = {In a document network such as a citation network of scientific documents, web-logs etc., the content produced by authors exhibit their interest in certain topics. In addition some authors influence other authors' interests. In this work, we propose to model the influence of cited authors along with the interests of citing authors. Moreover, we hypothesize that citations present in documents, the context surrounding the citation mention provides extra topical information about the cited authors. However, associating terms in the context to the cited authors remains an open problem. We propose novel document generation schemes that incorporate the context while simultaneously modeling the interests of citing authors and influence of the cited authors. Our experiments show significant improvements over baseline models for various evaluation criteria such as link prediction between document and cited author, and quantitatively explaining unseen text.},
  author = {Kataria, Saurabh and Mitra, Prasenjit and Caragea, Cornelia and Giles, C. Lee},
  booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence ({IJCAI})},
  interhash = {7496b4df1335fbc6aea691cecb65289d},
  intrahash = {dc774d17ec721be6d32530d265f34539},
  title = {Context Sensitive Topic Models for Author Influence in Document Networks},
  url = {https://www.aaai.org/ocs/index.php/IJCAI/IJCAI11/paper/view/3140},
  year = 2011
}

@article{thurau2012descriptive,
  abstract = {Climate change, the global energy footprint, and strategies for sustainable development have become topics of considerable political and public interest. The public debate is informed by an exponentially growing amount of data and there are diverse partisan interest when it comes to interpretation. We therefore believe that data analysis methods are called for that provide results which are intuitively understandable even to non-experts. Moreover, such methods should be efficient so that non-experts users can perform their own analysis at low expense in order to understand the effects of different parameters and influential factors. In this paper, we discuss a new technique for factorizing data matrices that meets both these requirements. The basic idea is to represent a set of data by means of convex combinations of extreme data points. This often accommodates human cognition. In contrast to established factorization methods, the approach presented in this paper can also determine over-complete bases. At the same time, convex combinations allow for highly efficient matrix factorization. Based on techniques adopted from the field of distance geometry, we derive a linear time algorithm to determine suitable basis vectors for factorization. By means of the example of several environmental and developmental data sets we discuss the performance and characteristics of the proposed approach and validate that significant efficiency gains are obtainable without performance decreases compared to existing convexity constrained approaches.},
  affiliation = {Fraunhofer Institute for Intelligent Analysis and Information Systems IAIS, Sankt Augustin, Germany},
  author = {Thurau, Christian and Kersting, Kristian and Wahabzada, Mirwaes and Bauckhage, Christian},
  doi = {10.1007/s10618-011-0216-z},
  interhash = {457c57f054fea45dcbc8447263591d97},
  intrahash = {387f4e1711d7065bd5a94455aeae1957},
  issn = {1384-5810},
  journal = {Data Mining and Knowledge Discovery},
  keyword = {Computer Science},
  number = 2,
  pages = {325--354},
  publisher = {Springer Netherlands},
  title = {Descriptive matrix factorization for sustainability: Adopting the principle of opposites},
  url = {http://dx.doi.org/10.1007/s10618-011-0216-z},
  volume = 24,
  year = 2012
}

@inproceedings{Ramage:2009:LLS:1699510.1699543,
  abstract = {A significant portion of the world's text is tagged by readers on social bookmarking websites. Credit attribution is an inherent problem in these corpora because most pages have multiple tags, but the tags do not always apply with equal specificity across the whole document. Solving the credit attribution problem requires associating each word in a document with the most appropriate tags and vice versa. This paper introduces Labeled LDA, a topic model that constrains Latent Dirichlet Allocation by defining a one-to-one correspondence between LDA's latent topics and user tags. This allows Labeled LDA to directly learn word-tag correspondences. We demonstrate Labeled LDA's improved expressiveness over traditional LDA with visualizations of a corpus of tagged web pages from del.icio.us. Labeled LDA outperforms SVMs by more than 3 to 1 when extracting tag-specific document snippets. As a multi-label text classifier, our model is competitive with a discriminative baseline on a variety of datasets.},
  acmid = {1699543},
  address = {Stroudsburg, PA, USA},
  author = {Ramage, Daniel and Hall, David and Nallapati, Ramesh and Manning, Christopher D.},
  booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 1},
  interhash = {45315f4da7b10debdca560506cf0d7ba},
  intrahash = {6e7173f084e26bca9a8d2a1ab4a5b709},
  isbn = {978-1-932432-59-6},
  location = {Singapore},
  numpages = {9},
  pages = {248--256},
  publisher = {Association for Computational Linguistics},
  series = {EMNLP '09},
  title = {Labeled {LDA}: A Supervised Topic Model for Credit Attribution in Multi-labeled Corpora},
  url = {http://dl.acm.org/citation.cfm?id=1699510.1699543},
  year = 2009
}

@inproceedings{conf/pkdd/BalasubramanyanDC13,
  author = {Balasubramanyan, Ramnath and Dalvi, Bhavana Bharat and Cohen, William W.},
  booktitle = {ECML/PKDD (2)},
  crossref = {conf/pkdd/2013-2},
  doi = {10.1007/978-3-642-40991-2_40},
  editor = {Blockeel, Hendrik and Kersting, Kristian and Nijssen, Siegfried and Železný, Filip},
  ee = {http://dx.doi.org/10.1007/978-3-642-40991-2_40},
  interhash = {9a32b7cc059a500ea302d0aa65036682},
  intrahash = {e56623d21a1b7bcb442cd15fe098bb70},
  isbn = {978-3-642-40990-5},
  pages = {628--642},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {From Topic Models to Semi-supervised Learning: Biasing Mixed-Membership Models to Exploit Topic-Indicative Features in Entity Clustering},
  url = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2013-2.html#BalasubramanyanDC13},
  volume = 8189,
  year = 2013
}

@article{Alonso2009273,
  author = {Alonso, S. and Cabrerizo, F. J. and Herrera-Viedma, E. and Herrera, F.},
  doi = {10.1016/j.joi.2009.04.001},
  interhash = {cbf95718465346edecef397149e4cf51},
  intrahash = {859c208f329fa96e26e35f1bcb7ab65d},
  issn = {1751-1577},
  journal = {Journal of Informetrics},
  number = 4,
  pages = {273--289},
  title = {h-Index: A review focused in its variants, computation and standardization for different scientific fields},
  url = {http://www.sciencedirect.com/science/article/pii/S1751157709000339},
  volume = 3,
  year = 2009
}

@inproceedings{Sautter:2012:IBR:2403832.2403883,
  abstract = {Parsing details like author names and titles out of bibliographic references of scientific publications is an important issue. However, most existing techniques are tailored to the highly standardized reference styles used in the last two to three decades. Their performance tends to degrade when faced with the wider variety of reference styles used in older, historic publications. Thus, existing techniques are of limited use when creating comprehensive bibliographies covering both historic and contemporary scientific publications. This paper presents RefParse, a generic approach to bibliographic reference parsing that is independent of any specific reference style. Its core feature is an inference mechanism that exploits the regularities inherent in any list of references to deduce its format. Our evaluation shows that RefParse outperforms existing parsers both for contemporary and for historic reference lists.},
  acmid = {2403883},
  address = {Berlin, Heidelberg},
  author = {Sautter, Guido and B{\"o}hm, Klemens},
  booktitle = {Proceedings of the Second International Conference on Theory and Practice of Digital Libraries},
  doi = {10.1007/978-3-642-33290-6_40},
  interhash = {20fe241af3945dca2e242ae72eae05ad},
  intrahash = {ce9a27e85a0cc6bef109d5130e7ed1ea},
  isbn = {978-3-642-33289-0},
  location = {Paphos, Cyprus},
  numpages = {13},
  pages = {370--382},
  publisher = {Springer-Verlag},
  series = {TPDL'12},
  title = {Improved Bibliographic Reference Parsing Based on Repeated Patterns},
  url = {http://dx.doi.org/10.1007/978-3-642-33290-6_40},
  year = 2012
}

% arXiv preprint: the identifier now lives in eprint/archiveprefix; note keeps only the arXiv comment text.
@misc{goldenberg2009survey,
  abstract = {Networks are ubiquitous in science and have become a focal point for
discussion in everyday life. Formal statistical models for the analysis of
network data have emerged as a major topic of interest in diverse areas of
study, and most of these involve a form of graphical representation.
Probability models on graphs date back to 1959. Along with empirical studies in
social psychology and sociology from the 1960s, these early works generated an
active network community and a substantial literature in the 1970s. This effort
moved into the statistical literature in the late 1970s and 1980s, and the past
decade has seen a burgeoning network literature in statistical physics and
computer science. The growth of the World Wide Web and the emergence of online
networking communities such as Facebook, MySpace, and LinkedIn, and a host of
more specialized professional network communities has intensified interest in
the study of networks and network data. Our goal in this review is to provide
the reader with an entry point to this burgeoning literature. We begin with an
overview of the historical development of statistical network modeling and then
we introduce a number of examples that have been studied in the network
literature. Our subsequent discussion focuses on a number of prominent static
and dynamic network models and their interconnections. We emphasize formal
model descriptions, and pay special attention to the interpretation of
parameters and their estimation. We end with a description of some open
problems and challenges for machine learning and statistics.},
  archiveprefix = {arXiv},
  author = {Goldenberg, Anna and Zheng, Alice X. and Fienberg, Stephen E. and Airoldi, Edoardo M.},
  eprint = {0912.5410},
  interhash = {bab22de06306d84cf357aadf48982d87},
  intrahash = {5e341981218d7cd89416c3371d56c794},
  note = {96 pages, 14 figures, 333 references},
  title = {A survey of statistical network models},
  url = {http://arxiv.org/abs/0912.5410},
  year = 2009
}

@inproceedings{journals/jmlr/ChangB09,
  author = {Chang, Jonathan and Blei, David M.},
  booktitle = {AISTATS},
  crossref = {conf/aistats/2009},
  editor = {van Dyk, David A. and Welling, Max},
  ee = {http://www.jmlr.org/proceedings/papers/v5/chang09a.html},
  interhash = {f3431fd69b315a22422a2c0f15ee0b71},
  intrahash = {86f665b74ecabb56e81542e0f052a331},
  pages = {81--88},
  publisher = {JMLR.org},
  series = {JMLR Proceedings},
  title = {Relational Topic Models for Document Networks},
  url = {http://dblp.uni-trier.de/db/journals/jmlr/jmlrp5.html#ChangB09},
  volume = 5,
  year = 2009
}

@inproceedings{Stenneth:2011:TMD:2093973.2093982,
  abstract = {The transportation mode such as walking, cycling or on a train denotes an important characteristic of the mobile user's context. In this paper, we propose an approach to inferring a user's mode of transportation based on the GPS sensor on her mobile device and knowledge of the underlying transportation network. The transportation network information considered includes real time bus locations, spatial rail and spatial bus stop information. We identify and derive the relevant features related to transportation network information to improve classification effectiveness. This approach can achieve over 93.5\% accuracy for inferring various transportation modes including: car, bus, aboveground train, walking, bike, and stationary. Our approach improves the accuracy of detection by 17\% in comparison with the GPS only approach, and 9\% in comparison with GPS with GIS models. The proposed approach is the first to distinguish between motorized transportation modes such as bus, car and aboveground train with such high accuracy. Additionally, if a user is travelling by bus, we provide further information about which particular bus the user is riding. Five different inference models including Bayesian Net, Decision Tree, Random Forest, Naïve Bayesian and Multilayer Perceptron, are tested in the experiments. The final classification system is deployed and available to the public.},
  acmid = {2093982},
  address = {New York, NY, USA},
  author = {Stenneth, Leon and Wolfson, Ouri and Yu, Philip S. and Xu, Bo},
  booktitle = {Proceedings of the 19th {ACM} {SIGSPATIAL} International Conference on Advances in Geographic Information Systems},
  doi = {10.1145/2093973.2093982},
  interhash = {07950385ca6bb9138db4f20bb3dd7698},
  intrahash = {6eff579bee29983fbf72403faa9b04ae},
  isbn = {978-1-4503-1031-4},
  location = {Chicago, Illinois},
  numpages = {10},
  pages = {54--63},
  publisher = {ACM},
  series = {GIS '11},
  title = {Transportation Mode Detection Using Mobile Phones and {GIS} Information},
  url = {http://doi.acm.org/10.1145/2093973.2093982},
  year = 2011
}

% NOTE(review): the citation key says 2013 while the year field says 2014 -- left as exported; confirm which is intended.
@book{DBLP:books/crc/aggarwal2013,
  bibsource = {DBLP, http://dblp.uni-trier.de},
  editor = {Aggarwal, Charu C. and Reddy, Chandan K.},
  ee = {http://www.crcpress.com/product/isbn/9781466558212},
  interhash = {5f150f838457faaa3805b0ed034c845f},
  intrahash = {7f1541e5800e6c36c67dd6bc0ef64ba7},
  isbn = {978-1-4665-5821-2},
  publisher = {CRC Press},
  title = {Data Clustering: Algorithms and Applications},
  url = {http://www.charuaggarwal.net/clusterbook.pdf},
  year = 2014
}

% arXiv preprint: the identifier moved from the free-text note into eprint/archiveprefix.
@misc{lan2013joint,
  abstract = {Modern machine learning methods are critical to the development of
large-scale personalized learning systems that cater directly to the needs of
individual learners. The recently developed SPARse Factor Analysis (SPARFA)
framework provides a new statistical model and algorithms for machine
learning-based learning analytics, which estimate a learner's knowledge of the
latent concepts underlying a domain, and content analytics, which estimate the
relationships among a collection of questions and the latent concepts. SPARFA
estimates these quantities given only the binary-valued graded responses to a
collection of questions. In order to better interpret the estimated latent
concepts, SPARFA relies on a post-processing step that utilizes user-defined
tags (e.g., topics or keywords) available for each question. In this paper, we
relax the need for user-defined tags by extending SPARFA to jointly process
both graded learner responses and the text of each question and its associated
answer(s) or other feedback. Our purely data-driven approach (i) enhances the
interpretability of the estimated latent concepts without the need of
explicitly generating a set of tags or performing a post-processing step, (ii)
improves the prediction performance of SPARFA, and (iii) scales to large
test/assessments where human annotation would prove burdensome. We demonstrate
the efficacy of the proposed approach on two real educational datasets.},
  archiveprefix = {arXiv},
  author = {Lan, Andrew S. and Studer, Christoph and Waters, Andrew E. and Baraniuk, Richard G.},
  eprint = {1305.1956},
  interhash = {911707523671c994e5c3fe63c3df5c4a},
  intrahash = {2a8df43258181ed85e5d43b489fd45fb},
  title = {Joint Topic Modeling and Factor Analysis of Textual Information and Graded Response Data},
  url = {http://arxiv.org/abs/1305.1956},
  year = 2013
}

@article{piatkowski2013spatiotemporal,
  author = {Piatkowski, Nico and Lee, Sangkyun and Morik, Katharina},
  doi = {10.1007/s10994-013-5399-7},
  interhash = {314e29a1c444118b8a4e8d2ba7ab6336},
  intrahash = {eed8d4fcd9cfc30c01c1bf72e8e9cdbb},
  issn = {0885-6125},
  journal = {Machine Learning},
  language = {English},
  number = 1,
  pages = {115--139},
  publisher = {Springer US},
  title = {Spatio-temporal random fields: compressible representation and distributed estimation},
  url = {http://dx.doi.org/10.1007/s10994-013-5399-7},
  volume = 93,
  year = 2013
}

@inproceedings{angeli-manning:2013:CoNLL-2013,
  address = {Sofia, Bulgaria},
  author = {Angeli, Gabor and Manning, Christopher},
  booktitle = {Proceedings of the Seventeenth Conference on Computational Natural Language Learning},
  interhash = {8f5a64ab020f6ff5da7613f8a330b64e},
  intrahash = {60b3f16a1e9616605808ff0cae125cf9},
  month = aug,
  pages = {133--142},
  publisher = {Association for Computational Linguistics},
  title = {Philosophers are Mortal: Inferring the Truth of Unseen Facts},
  url = {http://www.aclweb.org/anthology/W13-3515},
  year = 2013
}

% The venue was previously stored in the editor field; it belongs in booktitle.
% NOTE(review): the author field is missing in the exported record -- add the authors from the linked PDF.
@inproceedings{noauthororeditor2011noisemap,
  booktitle = {Second International Workshop on Sensing Applications on Mobile Phones, {ACM} {SenSys} 2011},
  interhash = {fb255bb53f64fc403f4e03cb73577bb5},
  intrahash = {248ff44fcf7b397c118a33116d05326b},
  title = {{NoiseMap} -- Real-time participatory noise maps},
  url = {http://research.microsoft.com/en-us/um/redmond/events/phonesense2011/papers/NoiseMap.pdf},
  year = 2011
}

@article{DBLP:journals/jise/YangK13,
  author = {Yang, Wen-Teng and Kao, Hung-Yu},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  ee = {http://www.iis.sinica.edu.tw/page/jise/2013/201307_01.html},
  interhash = {1c27ed73a081f71136c5d58067127342},
  intrahash = {aec1722879a25d5eb2ee9035803d9218},
  journal = {Journal of Information Science and Engineering},
  number = 4,
  pages = {615--630},
  title = {Measuring Semantic Relatedness using {Wikipedia} Signed Network},
  volume = 29,
  year = 2013
}

% NOTE(review): volume 99 / number PrePrints look like publisher placeholders for an online-first article;
% replace them with the final volume, issue and pages once confirmed from the DOI landing page.
@article{10.1109/TKDE.2012.115,
  address = {Los Alamitos, CA, USA},
  author = {Zubiaga, Arkaitz and Fresno, Victor and Martinez, Raquel and Garcia-Plaza, Alberto P.},
  doi = {10.1109/TKDE.2012.115},
  interhash = {f2e961e2b99fec0634b0d4fa3e001282},
  intrahash = {8a25332bfeb33e2ad8e1e1a062976da2},
  issn = {1041-4347},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  number = {PrePrints},
  publisher = {IEEE Computer Society},
  title = {Harnessing Folksonomies to Produce a Social Classification of Resources},
  volume = 99,
  year = 2012
}

@inproceedings{Yan:2012:BSS:2232817.2232831,
  abstract = {Usually scientists breed research ideas inspired by previous publications, but they are unlikely to follow all publications in the unbounded literature collection. The volume of literature keeps on expanding extremely fast, whilst not all papers contribute equal impact to the academic society. Being aware of potentially influential literature would put one in an advanced position in choosing important research references. Hence, estimation of potential influence is of great significance. We study a challenging problem of identifying potentially influential literature. We examine a set of hypotheses on what are the fundamental characteristics for highly cited papers and find some interesting patterns. Based on these observations, we learn to identify potentially influential literature via Future Influence Prediction (FIP), which aims to estimate the future influence of literature. The system takes a series of features of a particular publication as input and produces as output the estimated citation counts of that article after a given time period. We consider several regression models to formulate the learning process and evaluate their performance based on the coefficient of determination (R2). Experimental results on a real-large data set show a mean average predictive performance of 83.6\% measured in $R^2$. We apply the learned model to the application of bibliography recommendation and obtain prominent performance improvement in terms of Mean Average Precision (MAP).},
  acmid = {2232831},
  address = {New York, NY, USA},
  author = {Yan, Rui and Huang, Congrui and Tang, Jie and Zhang, Yan and Li, Xiaoming},
  booktitle = {Proceedings of the 12th ACM/IEEE-CS joint conference on Digital Libraries},
  doi = {10.1145/2232817.2232831},
  interhash = {85d10c6d37bcbfa057c51acc325a8116},
  intrahash = {9269d2dd9bf4bc8c0e7c668011fcfc1b},
  isbn = {978-1-4503-1154-0},
  location = {Washington, DC, USA},
  numpages = {10},
  pages = {51--60},
  publisher = {ACM},
  series = {JCDL '12},
  title = {To better stand on the shoulder of giants},
  url = {http://doi.acm.org/10.1145/2232817.2232831},
  year = 2012
}

@misc{haslhofer2013semantic,
  abstract = {Tags assigned by users to shared content can be ambiguous. As a possible solution, we propose semantic tagging as a collaborative process in which a user selects and associates Web resources drawn from a knowledge context. We applied this general technique in the specific context of online historical maps and allowed users to annotate and tag them. To study the effects of semantic tagging on tag production, the types and categories of obtained tags, and user task load, we conducted an in-lab within-subject experiment with 24 participants who annotated and tagged two distinct maps. We found that the semantic tagging implementation does not affect these parameters, while providing tagging relationships to well-defined concept definitions. Compared to label-based tagging, our technique also gathers positive and negative tagging relationships. We believe that our findings carry implications for designers who want to adopt semantic tagging in other contexts and systems on the Web.},
  archiveprefix = {arXiv},
  author = {Haslhofer, Bernhard and Robitza, Werner and Lagoze, Carl and Guimbretiere, Francois},
  eprint = {1304.1636},
  interhash = {84516aa456894b6d6adf86abd2386656},
  intrahash = {a653f1a0a1ac5084e80757ec277b1184},
  note = {10 pages},
  title = {Semantic Tagging on Historical Maps},
  url = {http://arxiv.org/abs/1304.1636},
  year = 2013
}

@inproceedings{aya2005citation,
  author    = {Aya, Selcuk and Lagoze, Carl and Joachims, Thorsten},
  title     = {Citation Classification and its Applications},
  booktitle = {Proceedings of the International Conference on Knowledge Management},
  chapter   = 24,
  pages     = {287--298},
  year      = 2005,
  month     = oct,
  publisher = {World Scientific Publishing},
  doi       = {10.1142/9789812701527_0024},
  url       = {http://www.worldscientific.com/doi/abs/10.1142/9789812701527_0024},
  eprint    = {http://www.worldscientific.com/doi/pdf/10.1142/9789812701527_0024},
  interhash = {f35b1f099571f3f134186ff407ee5fee},
  intrahash = {d30bac9f744e0473499f1d15d55258b8},
  abstract  = {Citation analysis has been used to study various aspects of scholarly communication. In general, these studies have not differentiated among the multiple reasons for citations. However, authors cite other works for a number of reasons including demonstrating knowledge of the field, establishing the placement of the citing work in the field, comparing and criticizing other works, and paying homage to seminal work by pioneers in the field. In this paper, we present a number of applications in which distinguishing among authors' motivations for citations might be useful and present a machine learning approach to automatically classifying citations according to these motivations. Our approach to citation classification makes use of the structure and the argumentative nature of the scientific papers. We present the results of experiments we ran on papers in the computer science field. The results are encouraging and give us hope that we can use our citation classifier in analyzing large corpora of scientific papers.}
}

@inproceedings{Kaur:2005:CLW:1054972.1054980,
  abstract = {A predictive tool to simulate human visual search behavior would help interface designers inform and validate their design. Such a tool would benefit from a semantic component that would help predict search behavior even in the absence of exact textual matches between goal and target. This paper discusses a comparison of three semantic systems-LSA, WordNet and PMI-IR-to evaluate their performance in predicting the link that people would select given an information goal and a webpage. PMI-IR best predicted human performance as observed in a user study.},
  acmid = {1054980},
  address = {New York, NY, USA},
  author = {Kaur, Ishwinder and Hornof, Anthony J.},
  booktitle = {Proceedings of the SIGCHI Conference on Human Factors in Computing Systems},
  doi = {10.1145/1054972.1054980},
  interhash = {ea35528c6c3ea3ca64cbbd6c6ae631ae},
  intrahash = {f8c070cb738ea82a40838b1eb8257e31},
  isbn = {1-58113-998-5},
  location = {Portland, Oregon, USA},
  numpages = {10},
  pages = {51--60},
  publisher = {ACM},
  series = {CHI '05},
  title = {A comparison of {LSA}, {WordNet} and {PMI-IR} for predicting user click behavior},
  url = {http://doi.acm.org/10.1145/1054972.1054980},
  year = 2005
}

@article{morstatter2013sample,
  author = {Morstatter, Fred and Pfeffer, J{\"u}rgen and Liu, Huan and Carley, Kathleen M.},
  interhash = {bca742d25a5f5fa43c8f106460449b5b},
  intrahash = {58707a28cc5098b9b3444501d5ca9a88},
  title = {Is the Sample Good Enough? Comparing Data from {Twitter's} Streaming {API} with {Twitter's} Firehose},
  url = {http://scholar.google.de/scholar.bib?q=info:NkS2afIrqyQJ:scholar.google.com/&output=citation&hl=de&as_sdt=0,5&ct=citation&cd=0},
  year = 2013
}

@article{doi:10.1080/13504509.2013.779326,
  author = {Li, Chunming and Wei, Dong and Vause, Jonathan and Liu, Jianping},
  doi = {10.1080/13504509.2013.779326},
  eprint = {http://www.tandfonline.com/doi/pdf/10.1080/13504509.2013.779326},
  interhash = {397604f8584c402f7c14dc2d2935baaa},
  intrahash = {a5a19958a5f397b467abdabe2f8adf69},
  journal = {International Journal of Sustainable Development \& World Ecology},
  pages = {1--6},
  title = {Towards a societal scale environmental sensing network with public participation},
  url = {http://www.tandfonline.com/doi/abs/10.1080/13504509.2013.779326},
  year = 2013
}

@misc{titov2008modeling,
  abstract = {In this paper we present a novel framework for extracting the ratable aspects of objects from online user reviews. Extracting such aspects is an important challenge in automatically mining product opinions from the web and in generating opinion-based summaries of user reviews. Our models are based on extensions to standard topic modeling methods such as LDA and PLSA to induce multi-grain topics. We argue that multi-grain models are more appropriate for our task since standard models tend to produce topics that correspond to global properties of objects (e.g., the brand of a product type) rather than the aspects of an object that tend to be rated by a user. The models we present not only extract ratable aspects, but also cluster them into coherent topics, e.g., `waitress' and `bartender' are part of the same topic `staff' for restaurants. This differentiates it from much of the previous work which extracts aspects through term frequency analysis with minimal clustering. We evaluate the multi-grain models both qualitatively and quantitatively to show that they improve significantly upon standard topic models.},
  archiveprefix = {arXiv},
  author = {Titov, Ivan and McDonald, Ryan},
  eprint = {0801.1063},
  interhash = {00cbf1df09c3f2c65d5a31a0537aed3f},
  intrahash = {f3286f5efa0115f465563d0259c32255},
  title = {Modeling Online Reviews with Multi-grain Topic Models},
  url = {http://arxiv.org/abs/0801.1063},
  year = 2008
}

@article{Juhos20081488,
  abstract = {The main aim of this paper is to predict NO and NO2 concentrations four days in advance comparing two artificial intelligence learning methods, namely, Multi-Layer Perceptron and Support Vector Machines on two kinds of spatial embedding of the temporal time series. Hourly values of NO and NO2 concentrations, as well as meteorological variables were recorded in a cross-road monitoring station with heavy traffic in Szeged in order to build a model for predicting NO and NO2 concentrations several hours in advance. The prediction of NO and NO2 concentrations was performed partly on the basis of their past values, and partly on the basis of temperature, humidity and wind speed data. Since NO can be predicted more accurately, its values were considered primarily when forecasting NO2. Time series prediction can be interpreted in a way that is suitable for artificial intelligence learning. Two effective learning methods, namely, Multi-Layer Perceptron and Support Vector Regression are used to provide efficient non-linear models for NO and NO2 times series predictions. Multi-Layer Perceptron is widely used to predict these time series, but Support Vector Regression has not yet been applied for predicting NO and NO2 concentrations. Grid search is applied to select the best parameters for the learners. To get rid of the curse of dimensionality of the spatial embedding of the time series Principal Component Analysis is taken to reduce the dimension of the embedded data. Three commonly used linear algorithms were considered as references: one-day persistence, average of several-day persistence and linear regression. Based on the good results of the average of several-day persistence, a prediction scheme was introduced, which forms weighted averages instead of simple ones. The optimization of these weights was performed with linear regression in linear case and with the learning methods mentioned in non-linear case. 
Concerning the NO predictions, the non-linear learning methods give significantly better predictions than the reference linear methods. In the case of NO2 the improvement of the prediction is considerable; however, it is less notable than for NO.},
  author = {Juhos, István and Makra, László and Tóth, Balázs},
  doi = {10.1016/j.simpat.2008.08.006},
  interhash = {b8e240cb2c8bb4d0f42aeda944a3ed15},
  intrahash = {70d6cf3c171445620c5024658516ac44},
  issn = {1569-190X},
  journal = {Simulation Modelling Practice and Theory},
  number = 9,
  pages = {1488--1502},
  title = {Forecasting of traffic origin {NO} and {NO$_2$} concentrations by Support Vector Machines and neural networks using Principal Component Analysis},
  url = {http://www.sciencedirect.com/science/article/pii/S1569190X08001585},
  volume = 16,
  year = 2008
}

@article{sun2013social,
  author = {Sun, Xiaoling and Kaur, Jasleen and Milojevic, Stasa and Flammini, Alessandro and Menczer, Filippo},
  doi = {10.1038/srep01069},
  interhash = {5cd31392e997555d78596f962044f84b},
  intrahash = {ed28353b082f3ccbd23ea85ea9d7c8e5},
  journal = {Scientific Reports},
  month = jan,
  pages = {1069},
  publisher = {Macmillan Publishers Limited},
  title = {Social Dynamics of Science},
  url = {http://dx.doi.org/10.1038/srep01069},
  volume = 3,
  year = 2013
}

@article{wilson2012practices,
  abstract = {Scientists spend an increasing amount of time building and using software. However, most scientists are never taught how to do this efficiently. As a result, many are unaware of tools and practices that would allow them to write more reliable and maintainable code with less effort. We describe a set of best practices for scientific software development that have solid foundations in research and experience, and that improve scientists' productivity and the reliability of their software.},
  archiveprefix = {arXiv},
  author = {Wilson, Greg and Aruliah, D. A. and Brown, C. Titus and Chue Hong, Neil P. and Davis, Matt and Guy, Richard T. and Haddock, Steven H. D. and Huff, Katy and Mitchell, Ian M. and Plumbley, Mark and Waugh, Ben and White, Ethan P. and Wilson, Paul},
  eprint = {1210.0530},
  interhash = {78f98610c430aa34dc2e161bb8069401},
  intrahash = {e28ce8ccadfa439cce3bcdcb5289b499},
  journal = {CoRR},
  month = oct,
  title = {Best Practices for Scientific Computing},
  url = {http://arxiv.org/abs/1210.0530},
  volume = {abs/1210.0530},
  year = 2012
}

@article{Zhang20125759,
  abstract = {Social tagging is one of the most important ways to organize and index online resources. Recommendation in social tagging systems, e.g. tag recommendation, item recommendation and user recommendation, is used to improve the quality of tags and to ease the tagging or searching process. Existing works usually provide recommendations by analyzing relation information in social tagging systems, suffering a lot from the over sparse problem. These approaches ignore information contained in the content of resources, which we believe should be considered to improve recommendation quality and to deal with the over sparse problem. In this paper we propose a recommendation approach for social tagging systems that combines content and relation analysis in a single model. By modeling the generating process of social tagging systems in a latent Dirichlet allocation approach, we build a fully generative model for social tagging, leverage it to estimate the relation between users, tags and resources and achieve tag, item and user recommendation tasks. The model is evaluated using a CiteULike data snapshot, and results show improvements in metrics for various recommendation tasks.},
  author = {Zhang, Yin and Zhang, Bin and Gao, Kening and Guo, Pengwei and Sun, Daming},
  doi = {10.1016/j.physa.2012.05.013},
  interhash = {088ad59c786579d399aaee48db5e6a7a},
  intrahash = {84f824839090a5e20394b85a9e1cef08},
  issn = {0378-4371},
  journal = {Physica A: Statistical Mechanics and its Applications},
  number = 22,
  pages = {5759--5768},
  title = {Combining content and relation analysis for recommendation in social tagging systems},
  url = {http://www.sciencedirect.com/science/article/pii/S0378437112003846},
  volume = 391,
  year = 2012
}

@inproceedings{Laniado2010,
  author = {Laniado, David and Mika, Peter},
  booktitle = {International Semantic Web Conference (1)},
  crossref = {conf/semweb/2010-1},
  doi = {10.1007/978-3-642-17746-0_30},
  editor = {Patel-Schneider, Peter F. and Pan, Yue and Hitzler, Pascal and Mika, Peter and Zhang, Lei and Pan, Jeff Z. and Horrocks, Ian and Glimm, Birte},
  ee = {http://dx.doi.org/10.1007/978-3-642-17746-0_30},
  interhash = {3a63f88e11f958d548fa91fe442e1dcf},
  intrahash = {58dace4881efbd12c81ef1cc2e6bf7b9},
  isbn = {978-3-642-17745-3},
  pages = {470--485},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Making Sense of {Twitter}},
  url = {http://dblp.uni-trier.de/db/conf/semweb/iswc2010-1.html#LaniadoM10},
  volume = 6496,
  year = 2010
}

@article{Lü20121,
  abstract = {The ongoing rapid expansion of the Internet greatly increases the necessity of effective recommender systems for filtering the abundant information. Extensive research for recommender systems is conducted by a broad range of communities including social and computer scientists, physicists, and interdisciplinary researchers. Despite substantial theoretical and practical achievements, unification and comparison of different approaches are lacking, which impedes further advances. In this article, we review recent developments in recommender systems and discuss the major challenges. We compare and evaluate available algorithms and examine their roles in the future developments. In addition to algorithms, physical aspects are described to illustrate macroscopic behavior of recommender systems. Potential impacts and future directions are discussed. We emphasize that recommendation has great scientific depth and combines diverse research fields which makes it interesting for physicists as well as interdisciplinary researchers.},
  author = {Lü, Linyuan and Medo, Matúš and Yeung, Chi Ho and Zhang, Yi-Cheng and Zhang, Zi-Ke and Zhou, Tao},
  doi = {10.1016/j.physrep.2012.02.006},
  interhash = {408fbf13302368693d501271268cda03},
  intrahash = {9594d6b87d49d22b783b9c95da1f59af},
  issn = {0370-1573},
  journal = {Physics Reports},
  note = {Recommender Systems},
  number = 1,
  pages = {1--49},
  title = {Recommender systems},
  url = {http://www.sciencedirect.com/science/article/pii/S0370157312000828},
  volume = 519,
  year = 2012
}

@inproceedings{Zheleva:2010:SMM:1772690.1772794,
  abstract = {User experience in social media involves rich interactions with the media content and other participants in the community. In order to support such communities, it is important to understand the factors that drive the users' engagement. In this paper we show how to define statistical models of different complexity to describe patterns of song listening in an online music community. First, we adapt the LDA model to capture music taste from listening activities across users and identify both the groups of songs associated with the specific taste and the groups of listeners who share the same taste. Second, we define a graphical model that takes into account listening sessions and captures the listening moods of users in the community. Our session model leads to groups of songs and groups of listeners with similar behavior across listening sessions and enables faster inference when compared to the LDA model. Our experiments with the data from an online media site demonstrate that the session model is better in terms of the perplexity compared to two other models: the LDA-based taste model that does not incorporate cross-session information and a baseline model that does not use latent groupings of songs.},
  acmid = {1772794},
  address = {New York, NY, USA},
  author = {Zheleva, Elena and Guiver, John and Mendes Rodrigues, Eduarda and Mili{\'c}-Frayling, Nata{\v{s}}a},
  booktitle = {Proceedings of the 19th International Conference on World Wide Web},
  doi = {10.1145/1772690.1772794},
  interhash = {7386777403403d0c1b524d1a7cf8065c},
  intrahash = {3a244bf0cd60252269e3c36530e34e8f},
  isbn = {978-1-60558-799-8},
  location = {Raleigh, North Carolina, USA},
  numpages = {10},
  pages = {1019--1028},
  publisher = {ACM},
  series = {WWW '10},
  title = {Statistical models of music-listening sessions in social media},
  url = {http://doi.acm.org/10.1145/1772690.1772794},
  year = 2010
}

@misc{weston2012latent,
  abstract = {Retrieval tasks typically require a ranking of items given a query. Collaborative filtering tasks, on the other hand, learn to model user's preferences over items. In this paper we study the joint problem of recommending items to a user with respect to a given query, which is a surprisingly common task. This setup differs from the standard collaborative filtering one in that we are given a query x user x item tensor for training instead of the more traditional user x item matrix. Compared to document retrieval we do have a query, but we may or may not have content features (we will consider both cases) and we can also take account of the user's profile. We introduce a factorized model for this new task that optimizes the top-ranked items returned for the given query and user. We report empirical results where it outperforms several baselines.},
  archiveprefix = {arXiv},
  author = {Weston, Jason and Wang, Chong and Weiss, Ron and Berenzweig, Adam},
  eprint = {1206.4603},
  interhash = {d0ea194dd0e3a6f35c578439efcb8bff},
  intrahash = {79c6771a9b032497635d5f39a39e921a},
  note = {ICML 2012},
  title = {Latent Collaborative Retrieval},
  url = {http://arxiv.org/abs/1206.4603},
  year = 2012
}

@inproceedings{kim2011personalized,
  abstract = {This paper looks inside FolkRank, one of the well-known folksonomy-based algorithms, to present its fundamental properties and promising possibilities for improving performance in tag recommendations. Moreover, we introduce a new way to compute a differential approach in FolkRank by representing it as a linear combination of the personalized PageRank vectors. By the linear combination, we present FolkRank's probabilistic interpretation that grasps how FolkRank works on a folksonomy graph in terms of the random surfer model. We also propose new FolkRank-like methods for tag recommendations to efficiently compute tags' rankings and thus reduce expensive computational cost of FolkRank. We show that the FolkRank approaches are feasible to recommend tags in real-time scenarios as well. The experimental evaluations show that the proposed methods provide fast tag recommendations with reasonable quality, as compared to FolkRank. Additionally, we discuss the diversity of the top n tags recommended by FolkRank and its variants.},
  acmid = {2043945},
  address = {New York, NY, USA},
  author = {Kim, Heung-Nam and El Saddik, Abdulmotaleb},
  booktitle = {Proceedings of the fifth ACM conference on Recommender systems},
  doi = {10.1145/2043932.2043945},
  interhash = {1004b267b14d0abde0f8ac3a7ceadd38},
  intrahash = {f022e60c5928e01c701d7ec539ec221b},
  isbn = {978-1-4503-0683-6},
  location = {Chicago, Illinois, USA},
  numpages = {8},
  pages = {45--52},
  publisher = {ACM},
  series = {RecSys '11},
  title = {Personalized {PageRank} vectors for tag recommendations: inside {FolkRank}},
  url = {http://doi.acm.org/10.1145/2043932.2043945},
  year = 2011
}

@article{Stankovic:1988:MRC:50810.50811,
  author     = {Stankovic, John A.},
  title      = {Misconceptions About Real-Time Computing: A Serious Problem for Next-Generation Systems},
  journal    = {Computer},
  volume     = 21,
  number     = 10,
  pages      = {10--19},
  numpages   = {10},
  year       = 1988,
  month      = oct,
  issue_date = {October 1988},
  publisher  = {IEEE Computer Society Press},
  address    = {Los Alamitos, CA, USA},
  issn       = {0018-9162},
  doi        = {10.1109/2.7053},
  url        = {http://dx.doi.org/10.1109/2.7053},
  acmid      = {50811},
  interhash  = {baf5af4c9407f83f4076d071190b686d},
  intrahash  = {59ace6dde76eabc73fbee8800c1c9aca},
  abstract   = {The author defines real-time computing and states and dispels the most common misconceptions about it. He discusses the fundamental technical issues of real-time computing. He examines specification and verification, scheduling theory, operating systems, programming languages and design methodology, distributed databases, artificial intelligence, fault tolerance, architectures, and communication.}
}

@inproceedings{Keally:2011:PTP:2070942.2070968,
  abstract = {The vast array of small wireless sensors is a boon to body sensor network applications, especially in the context awareness and activity recognition arena. However, most activity recognition deployments and applications are challenged to provide personal control and practical functionality for everyday use. We argue that activity recognition for mobile devices must meet several goals in order to provide a practical solution: user friendly hardware and software, accurate and efficient classification, and reduced reliance on ground truth. To meet these challenges, we present PBN: Practical Body Networking. Through the unification of TinyOS motes and Android smartphones, we combine the sensing power of on-body wireless sensors with the additional sensing power, computational resources, and user-friendly interface of an Android smartphone. We provide an accurate and efficient classification approach through the use of ensemble learning. We explore the properties of different sensors and sensor data to further improve classification efficiency and reduce reliance on user annotated ground truth. We evaluate our PBN system with multiple subjects over a two week period and demonstrate that the system is easy to use, accurate, and appropriate for mobile devices.},
  acmid = {2070968},
  address = {New York, NY, USA},
  author = {Keally, Matthew and Zhou, Gang and Xing, Guoliang and Wu, Jianxin and Pyles, Andrew},
  booktitle = {Proceedings of the 9th ACM Conference on Embedded Networked Sensor Systems},
  doi = {10.1145/2070942.2070968},
  interhash = {5e6a13d34026f65338cfa619054822c8},
  intrahash = {61e5e4559d031c4152b3f316c0aa5209},
  isbn = {978-1-4503-0718-5},
  location = {Seattle, Washington},
  numpages = {14},
  pages = {246--259},
  publisher = {ACM},
  series = {SenSys '11},
  title = {{PBN}: towards practical activity recognition using smartphone-based body sensor networks},
  url = {http://doi.acm.org/10.1145/2070942.2070968},
  year = 2011
}

@misc{backstrom2010supervised,
  abstract = {Predicting the occurrence of links is a fundamental problem in networks. In the link prediction problem we are given a snapshot of a network and would like to infer which interactions among existing members are likely to occur in the near future or which existing interactions are we missing. Although this problem has been extensively studied, the challenge of how to effectively combine the information from the network structure with rich node and edge attribute data remains largely open.   We develop an algorithm based on Supervised Random Walks that naturally combines the information from the network structure with node and edge level attributes. We achieve this by using these attributes to guide a random walk on the graph. We formulate a supervised learning task where the goal is to learn a function that assigns strengths to edges in the network such that a random walker is more likely to visit the nodes to which new links will be created in the future. We develop an efficient training algorithm to directly learn the edge strength estimation function.   Our experiments on the Facebook social graph and large collaboration networks show that our approach outperforms state-of-the-art unsupervised approaches as well as approaches that are based on feature extraction.},
  archiveprefix = {arXiv},
  author = {Backstrom, Lars and Leskovec, Jure},
  eprint = {1011.4071},
  interhash = {970b02221d407c64c1c35f997d4fe345},
  intrahash = {c5cc52fa016b384f9d7b5ae4da841d44},
  title = {Supervised Random Walks: Predicting and Recommending Links in Social Networks},
  url = {http://arxiv.org/abs/1011.4071},
  year = 2010
}

@misc{bakshy2012social,
  abstract = {Online social networking technologies enable individuals to simultaneously share information with any number of peers. Quantifying the causal effect of these technologies on the dissemination of information requires not only identification of who influences whom, but also of whether individuals would still propagate information in the absence of social signals about that information. We examine the role of social networks in online information diffusion with a large-scale field experiment that randomizes exposure to signals about friends' information sharing among 253 million subjects in situ. Those who are exposed are significantly more likely to spread information, and do so sooner than those who are not exposed. We further examine the relative role of strong and weak ties in information propagation. We show that, although stronger ties are individually more influential, it is the more abundant weak ties who are responsible for the propagation of novel information. This suggests that weak ties may play a more dominant role in the dissemination of information online than currently believed.},
  archiveprefix = {arXiv},
  author = {Bakshy, Eytan and Rosenn, Itamar and Marlow, Cameron and Adamic, Lada},
  eprint = {1201.4145},
  interhash = {3cb3b908425601c6f41f35fbe1b583ff},
  intrahash = {e9778b31557c5de1d3fc2dbb9188513f},
  note = {10 pages, 7 figures. In the Proceedings of ACM WWW 2012, April 16--20, 2012, Lyon, France},
  title = {The Role of Social Networks in Information Diffusion},
  url = {http://arxiv.org/abs/1201.4145},
  year = 2012
}

@article{10.1371/journal.pone.0019467,
  abstract = {Twitter is a free social networking and micro-blogging service that enables its millions of users to send and read each other's ``tweets,'' or short, 140-character messages. The service has more than 190 million registered users and processes about 55 million tweets per day. Useful information about news and geopolitical events lies embedded in the Twitter stream, which embodies, in the aggregate, Twitter users' perspectives and reactions to current events. By virtue of sheer volume, content embedded in the Twitter stream may be useful for tracking or even forecasting behavior if it can be extracted in an efficient manner. In this study, we examine the use of information embedded in the Twitter stream to (1) track rapidly-evolving public sentiment with respect to H1N1 or swine flu, and (2) track and measure actual disease activity. We also show that Twitter can be used as a measure of public interest or concern about health-related events. Our results show that estimates of influenza-like illness derived from Twitter chatter accurately track reported disease levels.},
  author = {Signorini, Alessio and Segre, Alberto Maria and Polgreen, Philip M.},
  doi = {10.1371/journal.pone.0019467},
  interhash = {56b199b8f3a3d085aef43e25b2aca06b},
  intrahash = {6c3bc3dabf1b1d0095774e87b14d3ad2},
  journal = {PLoS ONE},
  month = may,
  number = 5,
  pages = {e19467},
  publisher = {Public Library of Science},
  title = {The Use of {Twitter} to Track Levels of Disease Activity and Public Concern in the {U.S.} during the {Influenza A H1N1} Pandemic},
  url = {http://dx.doi.org/10.1371/journal.pone.0019467},
  volume = 6,
  year = 2011
}

@article{Haustein2011446,
  abstract = {Web 2.0 technologies are finding their way into academics: specialized social bookmarking services allow researchers to store and share scientific literature online. By bookmarking and tagging articles, academic prosumers generate new information about resources, i.e. usage statistics and content description of scientific journals. Given the lack of global download statistics, the authors propose the application of social bookmarking data to journal evaluation. For a set of 45 physics journals all 13,608 bookmarks from CiteULike, Connotea and BibSonomy to documents published between 2004 and 2008 were analyzed. This article explores bookmarking data in STM and examines in how far it can be used to describe the perception of periodicals by the readership. Four basic indicators are defined, which analyze different aspects of usage: Usage Ratio, Usage Diffusion, Article Usage Intensity and Journal Usage Intensity. Tags are analyzed to describe a reader-specific view on journal content.},
  author = {Haustein, Stefanie and Siebenlist, Tobias},
  doi = {10.1016/j.joi.2011.04.002},
  interhash = {13fe59aae3d6ef95b529ffe00ede4126},
  intrahash = {60170943fb293bcb54754710ec9dced1},
  issn = {1751-1577},
  journal = {Journal of Informetrics},
  number = 3,
  pages = {446--457},
  title = {Applying social bookmarking data to evaluate journal usage},
  url = {http://www.sciencedirect.com/science/article/pii/S1751157711000393},
  volume = 5,
  year = 2011
}

@misc{Kim2012,
  abstract = {Sentiment analysis predicts the presence of positive or negative emotions in a text document. In this paper we consider higher dimensional extensions of the sentiment concept, which represent a richer set of human emotions. Our approach goes beyond previous work in that our model contains a continuous manifold rather than a finite set of human emotions. We investigate the resulting model, compare it to psychological observations, and explore its predictive capabilities. Besides obtaining significant improvements over a baseline without manifold, we are also able to visualize different notions of positive sentiment in different domains.},
  archiveprefix = {arXiv},
  author = {Kim, Seungyeon and Li, Fuxin and Lebanon, Guy and Essa, Irfan},
  eprint = {1202.1568},
  interhash = {78c5eda9e1ef2780d70234dc4942203f},
  intrahash = {d169c08d5241a0912f3d60c97d87e2c0},
  note = {15 pages, 7 figures},
  title = {Beyond Sentiment: The Manifold of Human Emotions},
  url = {http://arxiv.org/abs/1202.1568},
  year = 2012
}

@article{Borges_DeCarvalho_Galante_Gonalves_Laender_2011,
  author = {Borges, Eduardo N and De Carvalho, Moisés G and Galante, Renata and Gonçalves, Marcos André and Laender, Alberto H F},
  interhash = {0271248d1218f087a643c4aa906607f9},
  intrahash = {e7bc9412f92dddbfd5eaf81648ac5849},
  journal = {Information Processing \& Management},
  number = 5,
  pages = {706--718},
  publisher = {Elsevier Ltd},
  title = {An unsupervised heuristic-based approach for bibliographic metadata deduplication},
  url = {http://linkinghub.elsevier.com/retrieve/pii/S0306457311000100},
  volume = 47,
  year = 2011
}

@article{cousins1998duplicate,
  abstract = {COPAC is a union catalog giving access to the online catalog records of some of the largest academic research libraries in the United Kingdom and Ireland. Discussion includes ways in which duplicate detection and record consolidation procedures are carried out, along with problem areas encountered. (Author/AEF)},
  author = {Cousins, Shirley Anne},
  interhash = {6880df322e69a00af4df1466c7730e7a},
  intrahash = {a1067917a86f9aaaa1d5610ae113436c},
  issn = {0165-5515},
  journal = {Journal of Information Science},
  number = 4,
  pages = {231--240},
  refid = {EJ573940},
  title = {Duplicate Detection and Record Consolidation in Large Bibliographic Databases: The {COPAC} Database Experience},
  url = {http://www.eric.ed.gov/ERICWebPortal/detail?accno=EJ573940},
  volume = 24,
  year = 1998
}

@inproceedings{conf/gfkl/PotthastS07,
  author = {Potthast, Martin and Stein, Benno},
  booktitle = {GfKl},
  crossref = {conf/gfkl/2007},
  doi = {10.1007/978-3-540-78246-9_71},
  editor = {Preisach, Christine and Burkhardt, Hans and Schmidt-Thieme, Lars and Decker, Reinhold},
  ee = {http://dx.doi.org/10.1007/978-3-540-78246-9_71},
  interhash = {3686fe6dcbfc3683234edb5d1d7aad05},
  intrahash = {2eeececfe9ce4c4956142231523df00a},
  isbn = {978-3-540-78239-1},
  pages = {601--609},
  publisher = {Springer},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  title = {New Issues in Near-duplicate Detection},
  url = {http://www.uni-weimar.de/medien/webis/publications/papers/stein_2008d.pdf},
  year = 2007
}

@article{KamelBoulos:2011:Int-J-Health-Geogr:22188675,
  abstract = {'Wikification of GIS by the masses' is a phrase-term first coined by Kamel Boulos in 2005, two years earlier than Goodchild's term 'Volunteered Geographic Information'. Six years later (2005-2011), OpenStreetMap and Google Earth (GE) are now full-fledged, crowdsourced 'Wikipedias of the Earth' par excellence, with millions of users contributing their own layers to GE, attaching photos, videos, notes and even 3-D (three dimensional) models to locations in GE. From using Twitter in participatory sensing and bicycle-mounted sensors in pervasive environmental sensing, to creating a 100,000-sensor geo-mashup using Semantic Web technology, to the 3-D visualisation of indoor and outdoor surveillance data in real-time and the development of next-generation, collaborative natural user interfaces that will power the spatially-enabled public health and emergency situation rooms of the future, where sensor data and citizen reports can be triaged and acted upon in real-time by distributed teams of professionals, this paper offers a comprehensive state-of-the-art review of the overlapping domains of the Sensor Web, citizen sensing and 'human-in-the-loop sensing' in the era of the Mobile and Social Web, and the roles these domains can play in environmental and public health surveillance and crisis/disaster informatics. We provide an in-depth review of the key issues and trends in these areas, the challenges faced when reasoning and making decisions with real-time crowdsourced data (such as issues of information overload, ``noise'', misinformation, bias and trust), the core technologies and Open Geospatial Consortium (OGC) standards involved (Sensor Web Enablement and Open GeoSMS), as well as a few outstanding project implementation examples from around the world.},
  author = {Kamel Boulos, M N and Resch, B and Crowley, D N and Breslin, J G and Sohn, G and Burtner, R and Pike, W A and Jezierski, E and Chuang, K Y},
  doi = {10.1186/1476-072X-10-67},
  interhash = {a6ac2747114ce6ffa0292d35b13d090a},
  intrahash = {def3ae64f180754a5477d0561989501f},
  journal = {International Journal of Health Geographics},
  month = dec,
  number = 1,
  pages = {67},
  pmid = {22188675},
  title = {Crowdsourcing, citizen sensing and {Sensor Web} technologies for public and environmental health surveillance and crisis management: trends, {OGC} standards and application examples},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/22188675},
  volume = 10,
  year = 2011
}

@misc{Tibely2012,
  abstract = {Due to the increasing popularity of collaborative tagging systems, the research on tagged networks, hypergraphs, ontologies, folksonomies and other related concepts is becoming an important interdisciplinary topic with great actuality and relevance for practical applications. In most collaborative tagging systems the tagging by the users is completely ``flat'', while in some cases they are allowed to define a shallow hierarchy for their own tags. However, usually no overall hierarchical organisation of the tags is given, and one of the interesting challenges of this area is to provide an algorithm generating the ontology of the tags from the available data. In contrast, there are also other type of tagged networks available for research, where the tags are already organised into a directed acyclic graph (DAG), encapsulating the ``is a sub-category of'' type of hierarchy between each other. In this paper we study how this DAG affects the statistical distribution of tags on the nodes marked by the tags in various real networks. We analyse the relation between the tag-frequency and the position of the tag in the DAG in two large sub-networks of the English Wikipedia and a protein-protein interaction network. We also study the tag co-occurrence statistics by introducing a 2d tag-distance distribution preserving both the difference in the levels and the absolute distance in the DAG for the co-occurring pairs of tags. Our most interesting finding is that the local relevance of tags in the DAG, (i.e., their rank or significance as characterised by, e.g., the length of the branches starting from them) is much more important than their global distance from the root. Furthermore, we also introduce a simple tagging model based on random walks on the DAG, capable of reproducing the main statistical features of tag co-occurrence.},
  archiveprefix = {arXiv},
  author = {Tibely, Gergely and Pollner, Peter and Vicsek, Tamas and Palla, Gergely},
  eprint = {1201.1085},
  interhash = {4df4eecbca062c3631b03ed8f72c4bef},
  intrahash = {3d1d1d6525a20c3ff484819e27a12c53},
  note = {Submitted to New Journal of Physics},
  title = {Ontologies and tag-statistics},
  url = {http://arxiv.org/abs/1201.1085},
  year = 2012
}

@misc{Sarma2011,
  abstract = {De-duplication---identification of distinct records referring to the same real-world entity---is a well-known challenge in data integration. Since very large datasets prohibit the comparison of every pair of records, {\em blocking} has been identified as a technique of dividing the dataset for pairwise comparisons, thereby trading off {\em recall} of identified duplicates for {\em efficiency}. Traditional de-duplication tasks, while challenging, typically involved a fixed schema such as Census data or medical records. However, with the presence of large, diverse sets of structured data on the web and the need to organize it effectively on content portals, de-duplication systems need to scale in a new dimension to handle a large number of schemas, tasks and data sets, while handling ever larger problem sizes. In addition, when working in a map-reduce framework it is important that canopy formation be implemented as a {\em hash function}, making the canopy design problem more challenging. We present CBLOCK, a system that addresses these challenges. CBLOCK learns hash functions automatically from attribute domains and a labeled dataset consisting of duplicates. Subsequently, CBLOCK expresses blocking functions using a hierarchical tree structure composed of atomic hash functions. The application may guide the automated blocking process based on architectural constraints, such as by specifying a maximum size of each block (based on memory requirements), impose disjointness of blocks (in a grid environment), or specify a particular objective function trading off recall for efficiency. As a post-processing step to automatically generated blocks, CBLOCK {\em rolls-up} smaller blocks to increase recall. We present experimental results on two large-scale de-duplication datasets at Yahoo!---consisting of over 140K movies and 40K restaurants respectively---and demonstrate the utility of CBLOCK.},
  archiveprefix = {arXiv},
  author = {Das Sarma, Anish and Jain, Ankur and Machanavajjhala, Ashwin and Bohannon, Philip},
  eprint = {1111.3689},
  interhash = {3f32848ef4bb26a3057c3feadff99c5a},
  intrahash = {389dba4432b1340211ef6be8e3d45a1d},
  title = {{CBLOCK}: An Automatic Blocking Mechanism for Large-Scale De-duplication Tasks},
  url = {http://arxiv.org/abs/1111.3689},
  year = 2011
}

@article{1742-5468-2007-06-P06010,
  abstract = {To account for strong ageing characteristics of citation networks, we modify the PageRank algorithm by initially distributing random surfers exponentially with age, in favour of more recent publications. The output of this algorithm, which we call CiteRank, is interpreted as approximate traffic to individual publications in a simple model of how researchers find new information. We optimize parameters of our algorithm to achieve the best performance. The results are compared for two rather different citation networks: all American Physical Society publications between 1893 and 2003 and the set of high-energy physics theory (hep-th) preprints. Despite major differences between these two networks, we find that their optimal parameters for the CiteRank algorithm are remarkably similar. The advantages and performance of CiteRank over more conventional methods of ranking publications are discussed.},
  author = {Walker, Dylan and Xie, Huafeng and Yan, Koon-Kiu and Maslov, Sergei},
  interhash = {86853f761733eaea09a273027a6c3c4a},
  intrahash = {ed618f45800255b5a5179d36849cd0b4},
  journal = {Journal of Statistical Mechanics: Theory and Experiment},
  number = 6,
  pages = {P06010},
  title = {Ranking scientific publications using a model of network traffic},
  url = {http://stacks.iop.org/1742-5468/2007/i=06/a=P06010},
  volume = 2007,
  year = 2007
}

@inproceedings{boshmaf2011socialbot,
  abstract = {Online Social Networks (OSNs) have become an integral part of today's Web. Politicians, celebrities, revolutionists, and others use OSNs as a podium to deliver their message to millions of active web users. Unfortunately, in the wrong hands, OSNs can be used to run astroturf campaigns to spread misinformation and propaganda. Such campaigns usually start off by infiltrating a targeted OSN on a large scale. In this paper, we evaluate how vulnerable OSNs are to a large-scale infiltration by socialbots: computer programs that control OSN accounts and mimic real users. We adopt a traditional web-based botnet design and built a Socialbot Network (SbN): a group of adaptive socialbots that are orchestrated in a command-and-control fashion. We operated such an SbN on Facebook---a 750 million user OSN---for about 8 weeks. We collected data related to users' behavior in response to a large-scale infiltration where socialbots were used to connect to a large number of Facebook users. Our results show that (1) OSNs, such as Facebook, can be infiltrated with a success rate of up to 80\%, (2) depending on users' privacy settings, a successful infiltration can result in privacy breaches where even more users' data are exposed when compared to a purely public access, and (3) in practice, OSN security defenses, such as the Facebook Immune System, are not effective enough in detecting or stopping a large-scale infiltration as it occurs.},
  author = {Boshmaf, Yazan and Muslukhov, Ildar and Beznosov, Konstantin and Ripeanu, Matei},
  booktitle = {Proc. of the Annual Computer Security Applications Conference 2011},
  interhash = {d384da66292051fab7eca0372805c9af},
  intrahash = {a6ef16ba759ee4c56ccd4d017560344e},
  publisher = {ACM},
  title = {The {Socialbot Network}: When Bots Socialize for Fame and Money},
  url = {http://lersse-dl.ece.ubc.ca/record/264/files/ACSAC_2011.pdf},
  year = 2011
}

@inproceedings{conf/mldm/ToivonenVVBV01,
  author = {Toivonen, Jarmo and Visa, Ari and Vesanen, Tomi and Back, Barbro and Vanharanta, Hannu},
  booktitle = {MLDM},
  crossref = {conf/mldm/2001},
  doi = {10.1007/3-540-44596-X_15},
  editor = {Perner, Petra},
  ee = {http://dx.doi.org/10.1007/3-540-44596-X_15},
  interhash = {2121b03b46ecdde012bae15ca8cf8ce6},
  intrahash = {2f23db9219b4d693acf15d7401684499},
  isbn = {3-540-42359-1},
  pages = {184--195},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Validation of Text Clustering Based on Document Contents},
  url = {http://dblp.uni-trier.de/db/conf/mldm/mldm2001.html#ToivonenVVBV01},
  volume = 2123,
  year = 2001
}

@book{metzler2011featurecentric,
  asin = {3642228976},
  author = {Metzler, Donald},
  dewey = {005},
  ean = {9783642228971},
  interhash = {4e473a9657c556434612d006a5a21460},
  intrahash = {22e5fe8501844167b64a5aed595f4372},
  isbn = {3642228976},
  publisher = {Springer},
  title = {A Feature-Centric View of Information Retrieval},
  url = {http://www.amazon.com/Feature-Centric-View-Information-Retrieval/dp/3642228976},
  year = 2011
}

@inproceedings{rezel2010swefe,
  abstract = {This paper presents SWE-FE: a suite of methods to extend folksonomies to the worldwide Sensor Web in order to tackle the emergent data rich information poor (DRIP) syndrome afflicting most geospatial applications on the Internet. SWE-FE leverages the geospatial information associated with three key components of such collaborative tagging systems: tags, resources and users. Specifically, SWE-FE provides algorithms for: i) suggesting tags for users during the tag input stage; ii) generating tag maps which provides for serendipitous browsing; and iii) personalized searching within the folksonomy. We implement SWE-FE on the GeoCENS Sensor Web platform as a case study for assessing the efficacy of our methods. We outline the evaluation framework that we are currently employing to carry out this assessment.},
  author = {Rezel, R. and Liang, S.},
  booktitle = {2010 International Symposium on Collaborative Technologies and Systems (CTS)},
  doi = {10.1109/CTS.2010.5478494},
  interhash = {9eb696593932c517873232386f8f61bf},
  intrahash = {d5b71572c7fea6504a0c0a3d84a9ecf0},
  month = may,
  pages = {349--356},
  publisher = {IEEE},
  title = {{SWE-FE}: Extending folksonomies to the {Sensor Web}},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5478494},
  year = 2010
}

@comment{transybil2009: BibTeX requires a booktitle for inproceedings entries and this record has none. TODO(review): supply the proceedings title.}
@inproceedings{transybil2009,
  author = {Tran, D. N. and Min, B. and Li, J. and Subramanian, L.},
  interhash = {34d39d14be357a65eefa8207a3fb5856},
  intrahash = {40c3dea03e3e4c561db6bc4b34c6f3da},
  organization = {Citeseer},
  title = {{Sybil}-resilient online content rating},
  url = {http://scholar.google.com/scholar.bib?q=info:YVSgj4tFvzEJ:scholar.google.com/&output=citation&hl=de&as_sdt=0&scfhb=1&ct=citation&cd=0},
  year = 2009
}

@inproceedings{conf/icdm/YassineH10,
  author = {Yassine, Mohamed and Hajj, Hazem},
  booktitle = {ICDM Workshops},
  crossref = {conf/icdm/2010w},
  doi = {10.1109/ICDMW.2010.75},
  editor = {Fan, Wei and Hsu, Wynne and Webb, Geoffrey I. and Liu, Bing and Zhang, Chengqi and Gunopulos, Dimitrios and Wu, Xindong},
  ee = {http://dx.doi.org/10.1109/ICDMW.2010.75},
  interhash = {72ae8c258d6559e4a90370453ecc2acc},
  intrahash = {8b0afeee143cec94f3058c214ae38c6f},
  pages = {1136--1142},
  publisher = {IEEE Computer Society},
  title = {A Framework for Emotion Mining from Text in Online Social Networks},
  url = {http://dblp.uni-trier.de/db/conf/icdm/icdmw2010.html#YassineH10},
  year = 2010
}

@inproceedings{Kammergruber2010a,
  abstract = {Digitally supported knowledge work, using tags for content organization, creates inherent challenges. In this paper we show the design of a corporate tagging framework facing these challenges. We describe the implementation of a thesaurus approach as a lightweight alternative to a more sophisticated ontology design. An RDF based architecture with a Web 2.0 style editor enables average users to enrich social tagging data with semantic relations.},
  author = {Kammergruber, Walter Christian and Ehms, Karsten},
  booktitle = {10th International Conference on Knowledge Management (I-KNOW '10)},
  interhash = {d9907846571f0057629f0202bb4beb7d},
  intrahash = {37048201a69ba88c0d56dcbf8d7f758b},
  owner = {woidda},
  pages = {11--18},
  title = {A Corporate Tagging Framework as Integration Service for Knowledge Workers},
  volume = 10,
  year = 2010
}

@inproceedings{Kim2008,
  address = {Berlin, Germany},
  author = {Kim, Hak Lae and Scerri, Simon and Breslin, John G. and Decker, Stefan and Kim, Hong Gee},
  booktitle = {Proceedings of the 2008 International Conference on {Dublin Core} and Metadata Applications},
  interhash = {9c5f5af6f47a1a563dbb405c5a58a3cc},
  intrahash = {7d3c3c2189394a8686ca9812d58bfe74},
  pages = {128--137},
  publisher = {Dublin Core Metadata Initiative},
  title = {The State of the Art in Tag Ontologies: A Semantic Model for Tagging and Folksonomies},
  year = 2008
}

@inproceedings{Bethard:2010:ICL:1871437.1871517,
  acmid = {1871517},
  address = {New York, NY, USA},
  author = {Bethard, Steven and Jurafsky, Dan},
  booktitle = {Proceedings of the 19th ACM international conference on Information and knowledge management},
  doi = {10.1145/1871437.1871517},
  interhash = {1cdf6c7da38af251279e9fb915266af2},
  intrahash = {369206c7472baeaa5ecefef586e16c6a},
  isbn = {978-1-4503-0099-5},
  location = {Toronto, ON, Canada},
  numpages = {10},
  pages = {609--618},
  publisher = {ACM},
  series = {CIKM '10},
  title = {Who should {I} cite: learning literature search models from citation behavior},
  url = {http://doi.acm.org/10.1145/1871437.1871517},
  year = 2010
}

@misc{Rubin2011,
  abstract = {Machine learning approaches to multi-label document classification have (to date) largely relied on discriminative modeling techniques such as support vector machines. A drawback of these approaches is that performance rapidly drops off as the total number of labels and the number of labels per document increase. This problem is amplified when the label frequencies exhibit the type of highly skewed distributions that are often observed in real-world datasets. In this paper we investigate a class of generative statistical topic models for multi-label documents that associate individual word tokens with different labels. We investigate the advantages of this approach relative to discriminative models, particularly with respect to classification problems involving large numbers of relatively rare labels. We compare the performance of generative and discriminative approaches on document labeling tasks ranging from datasets with several thousand labels to datasets with tens of labels. The experimental results indicate that generative models can achieve competitive multi-label classification performance compared to discriminative methods, and have advantages for datasets with many labels and skewed label frequencies.},
  archiveprefix = {arXiv},
  author = {Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark},
  eprint = {1107.2462},
  interhash = {e09d5d8587756d460a5d834025e75aac},
  intrahash = {f8a5a3958ae264d19c7f5415eb7f0bce},
  title = {Statistical Topic Models for Multi-Label Document Classification},
  url = {http://arxiv.org/abs/1107.2462},
  year = 2011
}

@book{srivastava2009mining,
  abstract = {Giving a broad perspective of the field from numerous vantage points, 'Text Mining' focuses on statistical methods for text mining and analysis. It examines methods to automatically cluster and classify text documents and applies these methods in a variety of areas.},
  address = {Boca Raton, FL},
  author = {Srivastava, Ashok and Sahami, Mehran},
  interhash = {290eabe518274b6fbcc73a106a7d52a6},
  intrahash = {45ab79501c114299142864becfa6c841},
  isbn = {9781420059403},
  publisher = {CRC Press},
  refid = {144226505},
  title = {Text Mining: Classification, Clustering, and Applications},
  url = {http://www.worldcat.org/search?qt=worldcat_org_all&q=9781420059403},
  year = 2009
}

@inproceedings{anagnostopoulos2011authority,
  author = {Anagnostopoulos, Aris and Brova, George and Terzi, Evimaria},
  booktitle = {Proceedings of the {ECML/PKDD} 2011},
  interhash = {4b69d0de5d0c542404c9eb387abb0ac2},
  intrahash = {eb4553d07c2975a62fff33e92646a7df},
  title = {Peer and Authority Pressure in Information-Propagation Models},
  year = 2011
}

@book{noauthororeditor2011privacy,
  editor = {Trepte, Sabine and Reinecke, Leonard},
  interhash = {0c1381abf25ce1766bf35b1d3b72d87b},
  intrahash = {6b40774e3fee58c844c9e059e77691df},
  isbn = {9783642215209},
  publisher = {Springer-Verlag New York Inc},
  refid = {731921793},
  title = {Privacy Online: Perspectives on Privacy and Self-disclosure in the Social Web},
  url = {http://www.worldcat.org/search?qt=worldcat_org_all&q=9783642215209},
  year = 2011
}

@article{Song19022010,
  abstract = {A range of applications, from predicting the spread of human and electronic viruses to city planning and resource management in mobile communications, depend on our ability to foresee the whereabouts and mobility of individuals, raising a fundamental question: To what degree is human behavior predictable? Here we explore the limits of predictability in human dynamics by studying the mobility patterns of anonymized mobile phone users. By measuring the entropy of each individual’s trajectory, we find a 93\% potential predictability in user mobility across the whole user base. Despite the significant differences in the travel patterns, we find a remarkable lack of variability in predictability, which is largely independent of the distance users cover on a regular basis.},
  author = {Song, Chaoming and Qu, Zehui and Blumm, Nicholas and Barabási, Albert-László},
  doi = {10.1126/science.1177170},
  eprint = {http://www.sciencemag.org/content/327/5968/1018.full.pdf},
  interhash = {f2611a08bf6db54f86e884c05f3cb5fb},
  intrahash = {a89330f8eb32ce62b5f5c9a2b4909f25},
  journal = {Science},
  number = 5968,
  pages = {1018--1021},
  title = {Limits of Predictability in Human Mobility},
  url = {http://www.sciencemag.org/content/327/5968/1018.abstract},
  volume = 327,
  year = 2010
}

@article{Panisson2011,
  abstract = {We report on a data-driven investigation aimed at understanding the dynamics of message spreading in a real-world dynamical network of human proximity. We use data collected by means of a proximity-sensing network of wearable sensors that we deployed at three different social gatherings, simultaneously involving several hundred individuals. We simulate a message spreading process over the recorded proximity network, focusing on both the topological and the temporal properties. We show that by using an appropriate technique to deal with the temporal heterogeneity of proximity events, a universal statistical pattern emerges for the delivery times of messages, robust across all the data sets. Our results are useful to set constraints for generic processes of data dissemination, as well as to validate established models of human mobility and proximity that are frequently used to simulate realistic behaviors.},
  author = {Panisson, André and Barrat, Alain and Cattuto, Ciro and {Van den Broeck}, Wouter and Ruffo, Giancarlo and Schifanella, Rossano},
  doi = {10.1016/j.adhoc.2011.06.003},
  interhash = {9f6f2d9c4e270adf9f97fd24fb9eaa07},
  intrahash = {25159ef75d2777b837bdea0eb063a4f1},
  issn = {1570-8705},
  journal = {Ad Hoc Networks},
  note = {In Press, Accepted Manuscript},
  title = {On the Dynamics of Human Proximity for Data Diffusion in Ad-Hoc Networks},
  url = {http://www.sciencedirect.com/science/article/pii/S1570870511001272},
  year = 2011
}

@inproceedings{conf/www/SenVR09,
  author = {Sen, Shilad and Vig, Jesse and Riedl, John},
  booktitle = {WWW},
  crossref = {conf/www/2009},
  doi = {10.1145/1526709.1526800},
  editor = {Quemada, Juan and León, Gonzalo and Maarek, Yoëlle S. and Nejdl, Wolfgang},
  ee = {http://doi.acm.org/10.1145/1526709.1526800},
  interhash = {4968b29a544394a5f9acd1bb8916e230},
  intrahash = {8d38bdb12f6f2f89bd3c34d200e48b72},
  isbn = {978-1-60558-487-4},
  pages = {671--680},
  publisher = {ACM},
  title = {Tagommenders: connecting users to items through tags},
  url = {http://dblp.uni-trier.de/db/conf/www/www2009.html#SenVR09},
  year = 2009
}

@inproceedings{Cooper:2010:BCS:1807128.1807152,
  abstract = {While the use of MapReduce systems (such as Hadoop) for large scale data analysis has been widely recognized and studied, we have recently seen an explosion in the number of systems developed for cloud data serving. These newer systems address "cloud OLTP" applications, though they typically do not support ACID transactions. Examples of systems proposed for cloud serving use include BigTable, PNUTS, Cassandra, HBase, Azure, CouchDB, SimpleDB, Voldemort, and many others. Further, they are being applied to a diverse range of applications that differ considerably from traditional (e.g., TPC-C like) serving workloads. The number of emerging cloud serving systems and the wide range of proposed applications, coupled with a lack of apples-to-apples performance comparisons, makes it difficult to understand the tradeoffs between systems and the workloads for which they are suited. We present the "Yahoo! Cloud Serving Benchmark" (YCSB) framework, with the goal of facilitating performance comparisons of the new generation of cloud data serving systems. We define a core set of benchmarks and report results for four widely used systems: Cassandra, HBase, Yahoo!'s PNUTS, and a simple sharded MySQL implementation. We also hope to foster the development of additional cloud benchmark suites that represent other classes of applications by making our benchmark tool available via open source. In this regard, a key feature of the YCSB framework/tool is that it is extensible---it supports easy definition of new workloads, in addition to making it easy to benchmark new systems.},
  acmid = {1807152},
  address = {New York, NY, USA},
  author = {Cooper, Brian F. and Silberstein, Adam and Tam, Erwin and Ramakrishnan, Raghu and Sears, Russell},
  booktitle = {Proceedings of the 1st ACM symposium on Cloud computing},
  doi = {10.1145/1807128.1807152},
  interhash = {379999e8da039d731bfb9195691c08e8},
  intrahash = {dd14b6e7abc247836d50af16e87fe5bb},
  isbn = {978-1-4503-0036-0},
  location = {Indianapolis, Indiana, USA},
  numpages = {12},
  pages = {143--154},
  publisher = {ACM},
  series = {SoCC '10},
  title = {Benchmarking cloud serving systems with {YCSB}},
  url = {http://doi.acm.org/10.1145/1807128.1807152},
  year = 2010
}

@article{lindsey2011parsing,
  author = {Harrison, Jill Lindsey},
  doi = {10.1080/08941920903403115},
  interhash = {6b1e18823139ad15756bc200b40048ff},
  intrahash = {a40eaca13e7d7ea916275dda75413966},
  issn = {0894-1920},
  journal = {Society \& Natural Resources: An International Journal},
  number = 7,
  pages = {702--716},
  publisher = {Routledge},
  title = {Parsing ``Participation'' in Action Research: Navigating the Challenges of Lay Involvement in Technically Complex Participatory Science Projects},
  url = {http://www.informaworld.com/10.1080/08941920903403115},
  volume = 24,
  year = 2011
}

@article{journals/siamrev/KoldaB09,
  author = {Kolda, Tamara G. and Bader, Brett W.},
  doi = {10.1137/07070111X},
  ee = {http://dx.doi.org/10.1137/07070111X},
  interhash = {b30bb2d42e1a05fc41370c50844822ad},
  intrahash = {6b115affb18f3f1f99411596c03787f8},
  journal = {SIAM Review},
  number = 3,
  pages = {455--500},
  title = {Tensor Decompositions and Applications},
  url = {http://dblp.uni-trier.de/db/journals/siamrev/siamrev51.html#KoldaB09},
  volume = 51,
  year = 2009
}

@article{march06crane,
  author = {Crane, Gregory},
  doi = {10.1045/march2006-crane},
  interhash = {36d4825e3189d89195693d1449e9aaea},
  intrahash = {eea7ae2ac1480c84f87544f2942c28f2},
  issn = {1082-9873},
  journal = {D-Lib Magazine},
  month = mar,
  number = 3,
  title = {What Do You Do with a Million Books?},
  url = {http://www.dlib.org/dlib/march06/crane/03crane.html},
  volume = 12,
  year = 2006
}

@inproceedings{conf/sigir/GuanBMCW09,
  author = {Guan, Ziyu and Bu, Jiajun and Mei, Qiaozhu and Chen, Chun and Wang, Can},
  booktitle = {SIGIR},
  crossref = {conf/sigir/2009},
  doi = {10.1145/1571941.1572034},
  editor = {Allan, James and Aslam, Javed A. and Sanderson, Mark and Zhai, ChengXiang and Zobel, Justin},
  ee = {http://doi.acm.org/10.1145/1571941.1572034},
  interhash = {53d2e8bc966048bc01efcc57b2fc8250},
  intrahash = {ac9427acf51cbf7cb5a35f66a16a32c0},
  isbn = {978-1-60558-483-6},
  pages = {540--547},
  publisher = {ACM},
  title = {Personalized tag recommendation using graph-based ranking on multi-type interrelated objects},
  url = {http://www-personal.umich.edu/~qmei/pub/sigir09-tag.pdf},
  year = 2009
}

@article{journals/corr/abs-1103-0398,
  archiveprefix = {arXiv},
  author = {Collobert, Ronan and Weston, Jason and Bottou, Léon and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel P.},
  ee = {http://arxiv.org/abs/1103.0398},
  eprint = {1103.0398},
  interhash = {c1e968fc1903e842ab3c638cd5ffca61},
  intrahash = {24c6f6531a70625136167307bc15a480},
  journal = {CoRR},
  note = {informal publication},
  title = {Natural Language Processing (almost) from Scratch},
  url = {http://static.googleusercontent.com/external_content/untrusted_dlcp/research.google.com/de//pubs/archive/35671.pdf},
  volume = {abs/1103.0398},
  year = 2011
}

@misc{Medus2011,
  abstract = {The individual-based models constitute a set of widely implemented tools to
analyze the incidence of individuals heterogeneities in the spread of an
infectious disease. In this work we focus our attention on human contacts
heterogeneities through two of the main individual-based models: mobile agents
and complex networks models. We introduce a novel mobile agents model in which
individuals make displacements with sizes according to a truncated power-law
distribution based on empirical evidence about human mobility. Besides, we
present a procedure to obtain an equivalent weighted contact network from the
previous mobile agents model, where the weights of the links are interpreted as
contact probabilities. From the topological analysis of the equivalent contact
networks we show that small world characteristics are related with truncated
power-law distribution for agent displacements. Finally, we show the
equivalence between both approaches through some numerical experiments for the
spread of an infectious disease.},
  archiveprefix = {arXiv},
  author = {Medus, A. D. and Dorso, C. O.},
  eprint = {1104.4913},
  interhash = {d2f8fefb3a26400deab44de456bd990c},
  intrahash = {8ad77fa71d773ea860280d528fd0b1c2},
  note = {19 pages, 7 figures, submitted to PRE},
  title = {Diseases spreading through individual based models with realistic mobility patterns},
  url = {http://arxiv.org/abs/1104.4913},
  year = 2011
}

@article{Elmagarmid2007Duplicate,
  abstract = {Often, in the real world, entities have two or more representations in databases. Duplicate records do not share a common key and/or they contain errors that make duplicate matching a difficult task. Errors are introduced as the result of transcription errors, incomplete information, lack of standard formats, or any combination of these factors. In this paper, we present a thorough analysis of the literature on duplicate record detection. We cover similarity metrics that are commonly used to detect similar field entries, and we present an extensive set of duplicate detection algorithms that can detect approximately duplicate records in a database. We also cover multiple techniques for improving the efficiency and scalability of approximate duplicate detection algorithms. We conclude with coverage of existing tools and with a brief discussion of the big open problems in the area.},
  author = {Elmagarmid, A. K. and Ipeirotis, P. G. and Verykios, V. S.},
  citeulike-article-id = {1116298},
  interhash = {c8603198a5bd3d2e571462e08f50e12b},
  intrahash = {bfff8a370abdf14f7f882f87c1ff61e1},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  number = 1,
  pages = {1--16},
  posted-at = {2008-02-06 12:37:40},
  priority = {5},
  title = {Duplicate Record Detection: A Survey},
  url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4016511},
  volume = 19,
  year = 2007
}

@misc{Traud2011,
  abstract = {We study the social structure of Facebook ``friendship'' networks at one
hundred American colleges and universities at a single point in time, and we
examine the roles of user attributes - gender, class year, major, high school,
and residence - at these institutions. We investigate the influence of common
attributes at the dyad level in terms of assortativity coefficients and
regression models. We then examine larger-scale groupings by detecting
communities algorithmically and comparing them to network partitions based on
the user characteristics. We thereby compare the relative importances of
different characteristics at different institutions, finding for example that
common high school is more important to the social organization of large
institutions and that the importance of common major varies significantly
between institutions. Our calculations illustrate how microscopic and
macroscopic perspectives give complementary insights on the social organization
at universities and suggest future studies to investigate such phenomena
further.},
  archiveprefix = {arXiv},
  author = {Traud, Amanda L. and Mucha, Peter J. and Porter, Mason A.},
  eprint = {1102.2166},
  interhash = {2bb0d7d1589f4e651c07f4419bc68c02},
  intrahash = {8afd9e99551c5fc1343fcc47542dbef6},
  note = {82 pages (including many pages of tables), 8 multi-part figures, ``Facebook100'' data used in this paper is publicly available at \url{http://people.maths.ox.ac.uk/~porterm/data/facebook100.zip}},
  title = {Social Structure of Facebook Networks},
  url = {http://arxiv.org/abs/1102.2166},
  year = 2011
}

@article{Hill:September_2006:1061-8600:584,
  abstract = {A dynamic network is a special type of network composed of connected transactors which have repeated evolving interaction. Data on large dynamic networks such as telecommunications networks and the Internet are pervasive. However, representing dynamic networks in a manner that is conducive to efficient large-scale analysis is a challenge. In this article, we represent dynamic graphs using a data structure introduced in an earlier article. We advocate their representation because it accounts for the evolution of relationships between transactors through time, mitigates noise at the local transactor level, and allows for the removal of stale relationships. Our work improves on their heuristic arguments by formalizing the representation with three tunable parameters. In doing this, we develop a generic framework for evaluating and tuning any dynamic graph. We show that the storage saving approximations involved in the representation do not affect predictive performance, and typically improve it. We motivate our approach using a fraud detection example from the telecommunications industry, and demonstrate that we can outperform published results on the fraud detection task. In addition, we present a preliminary analysis on Web logs and e-mail networks.},
  author = {Hill, Shawndra and Agarwal, Deepak K. and Bell, Robert and Volinsky, Chris},
  doi = {10.1198/106186006X139162},
  interhash = {0bef8e24366140d674636ff4f032a8de},
  intrahash = {c4c90214919c4edb8da5d69b78e5180b},
  journal = {Journal of Computational \& Graphical Statistics},
  month = sep,
  number = 3,
  pages = {584--608},
  title = {Building an Effective Representation for Dynamic Networks},
  url = {http://www.ingentaconnect.com/content/asa/jcgs/2006/00000015/00000003/art00006},
  volume = 15,
  year = 2006
}

@article{Gunawardana2935,
  author = {Gunawardana, Asela and Shani, Guy},
  interhash = {441df9b673faf85aecc45babd8883069},
  intrahash = {49600df05a884106989d71dedcaa7e1b},
  journal = {Journal of Machine Learning Research},
  pages = {2935--2962},
  title = {A Survey of Accuracy Evaluation Metrics of Recommendation Tasks},
  url = {http://jmlr.csail.mit.edu/papers/v10/gunawardana09a.html},
  volume = 10,
  year = 2009
}

@inproceedings{Bollegala07semanticSearch,
  address = {New York, NY, USA},
  author = {Bollegala, Danushka and Matsuo, Yutaka and Ishizuka, Mitsuru},
  booktitle = {WWW '07: Proceedings of the 16th international conference on World Wide Web},
  doi = {10.1145/1242572.1242675},
  interhash = {46247eb09b5e87a6e5d4a8b2cf821ee7},
  intrahash = {c957aa2fd65df63c8c4af14b1fc827c5},
  isbn = {978-1-59593-654-7},
  location = {Banff, Alberta, Canada},
  pages = {757--766},
  publisher = {ACM},
  title = {Measuring semantic similarity between words using web search engines},
  year = 2007
}

@inproceedings{citeulike:8506476,
  abstract = {Social tagging systems pose new challenges to developers of recommender systems. As observed by recent research, traditional implementations of classic recommender approaches, such as collaborative filtering, are not working well in this new context. To address these challenges, a number of research groups worldwide work on adapting these approaches to the specific nature of social tagging systems. In joining this stream of research, we have developed and evaluated two enhancements of user-based collaborative filtering algorithms to provide recommendations of articles on CiteULike, a social tagging service for scientific articles. The result obtained after two phases of evaluation suggests that both enhancements are beneficial. Incorporating the number of raters into the algorithms, as we do in our NwCF approach, leads to an improvement of precision, while tag-based BM25 similarity measure, an alternative to Pearson correlation for calculating the similarity between users and their neighbors, increases the coverage of the recommendation process.},
  address = {Los Alamitos, CA, USA},
  author = {Santander, Denis P. and Brusilovsky, Peter},
  booktitle = {IEEE/WIC/ACM International Conference on Web Intelligence and Intelligent Agent Technology},
  citeulike-article-id = {8506476},
  citeulike-linkout-0 = {http://doi.ieeecomputersociety.org/10.1109/WI-IAT.2010.261},
  citeulike-linkout-1 = {http://dx.doi.org/10.1109/WI-IAT.2010.261},
  doi = {10.1109/WI-IAT.2010.261},
  interhash = {dd320da969151c01cf270976c0803274},
  intrahash = {2c8764f2fe11ef1ae43fc0a5b51301ae},
  isbn = {978-0-7695-4191-4},
  pages = {136--142},
  posted-at = {2011-01-05 00:19:36},
  priority = {0},
  publisher = {IEEE Computer Society},
  title = {Improving Collaborative Filtering in Social Tagging Systems for the Recommendation of Scientific Articles},
  url = {http://dx.doi.org/10.1109/WI-IAT.2010.261},
  volume = 1,
  year = 2010
}

@article{springerlink:10.1007/s00778-010-0204-8,
  abstract = {Graph conductance queries, also known as personalized PageRank and related to random walks with restarts, were originally proposed to assign a hyperlink-based prestige score to Web pages. More general forms of such queries are also very useful for ranking in entity-relation (ER) graphs used to represent relational, XML and hypertext data. Evaluation of PageRank usually involves a global eigen computation. If the graph is even moderately large, interactive response times may not be possible. Recently, the need for interactive PageRank evaluation has increased. The graph may be fully known only when the query is submitted. Browsing actions of the user may change some inputs to the PageRank computation dynamically. In this paper, we describe a system that analyzes query workloads and the ER graph, invests in limited offline indexing, and exploits those indices to achieve essentially constant-time query processing, even as the graph size scales. Our techniques—data and query statistics collection, index selection and materialization, and query-time index exploitation—have parallels in the extensive relational query optimization literature, but is applied to supporting novel graph data repositories. We report on experiments with five temporal snapshots of the CiteSeer ER graph having 74–702 thousand entity nodes, 0.17–1.16 million word nodes, 0.29–3.26 million edges between entities, and 3.29–32.8 million edges between words and entities. We also used two million actual queries from CiteSeer’s logs. Queries run 3–4 orders of magnitude faster than whole-graph PageRank, the gap growing with graph size. Index size is smaller than a text index. Ranking accuracy is 94–98\% with reference to whole-graph PageRank.},
  address = {Berlin / Heidelberg},
  affiliation = {IIT Bombay, Powai, Mumbai, Maharashtra India},
  author = {Chakrabarti, Soumen and Pathak, Amit and Gupta, Manish},
  doi = {10.1007/s00778-010-0204-8},
  interhash = {96a2d92f703a13f77bae8f56372f3e1b},
  intrahash = {dcc951cd461fe1c454db7a738429d421},
  issn = {1066-8888},
  journal = {The VLDB Journal},
  keyword = {Computer Science},
  pages = {1--26},
  publisher = {Springer},
  title = {Index design and query processing for graph conductance search},
  url = {http://dx.doi.org/10.1007/s00778-010-0204-8},
  year = 2010
}

@book{noauthororeditoryahoo,
  abstract = {The past decade has witnessed the emergence of participatory Web and social media, bringing people together in many creative ways. Millions of users are playing, tagging, working, and socializing online, demonstrating new forms of collaboration, communication, and intelligence that were hardly imaginable just a short time ago. Social media also helps reshape business models, sway opinions and emotions, and opens up numerous possibilities to study human interaction and collective behavior in an unparalleled scale. This lecture, from a data mining perspective, introduces characteristics of social media, reviews representative tasks of computing with social media, and illustrates associated challenges. It introduces basic concepts, presents state-of-the-art algorithms with easy-to-understand examples, and recommends effective evaluation methods. In particular, we discuss graph-based community detection techniques and many important extensions that handle dynamic, heterogeneous networks in social media. We also demonstrate how discovered patterns of communities can be used for social media mining. The concepts, algorithms, and methods presented in this lecture can help harness the power of social media and support building socially-intelligent systems. This book is an accessible introduction to the study of \emph{community detection and mining in social media}. It is an essential reading for students, researchers, and practitioners in disciplines and applications where social media is a key source of data that piques our curiosity to understand, manage, innovate, and excel.

This book is supported by additional materials, including lecture slides, the complete set of figures, key references, some toy data sets used in the book, and the source code of representative algorithms. The readers are encouraged to visit the book website for the latest information.

Table of Contents: Social Media and Social Computing / Nodes, Ties, and Influence / Community Detection and Evaluation / Communities in Heterogeneous Networks / Social Media Mining},
  author = {Tang, Lei and Liu, Huan},
  doi = {10.2200/S00298ED1V01Y201009DMK003},
  interhash = {717f8b976eec1dc934a3b84675456f25},
  intrahash = {c4e1fa6bf2d52a237e5557640d87c970},
  publisher = {Morgan \& Claypool},
  title = {Community Detection and Mining in Social Media},
  url = {http://www.morganclaypool.com/doi/abs/10.2200/S00298ED1V01Y201009DMK003},
  year = 2010
}

@inproceedings{conf/wsdm/HeymannPG10,
  author = {Heymann, Paul and Paepcke, Andreas and Garcia-Molina, Hector},
  booktitle = {WSDM},
  crossref = {conf/wsdm/2010},
  doi = {10.1145/1718487.1718495},
  editor = {Davison, Brian D. and Suel, Torsten and Craswell, Nick and Liu, Bing},
  ee = {http://doi.acm.org/10.1145/1718487.1718495},
  interhash = {d4f72ed57e6b99dbe32e18e218d81ef5},
  intrahash = {12579231cd5449f9a40cba9924975f09},
  isbn = {978-1-60558-889-6},
  pages = {51--60},
  publisher = {ACM},
  title = {Tagging human knowledge},
  url = {http://dblp.uni-trier.de/db/conf/wsdm/wsdm2010.html#HeymannPG10},
  year = 2010
}

@inproceedings{moeslein2010towards,
  author = {Moeslein, Kathrin},
  booktitle = {{AMCIS} 2010 Proceedings},
  interhash = {02028d63753faf29e7bed05d4becc5b5},
  intrahash = {d3d2fcd50cb455b87ede304e727d266b},
  title = {Towards Research Collaboration – a Taxonomy of Social Research Network Sites},
  url = {http://aisel.aisnet.org/amcis2010/92/},
  year = 2010
}

@misc{Farrahi_discoveringhuman,
  abstract = {We present a framework to automatically discover people’s routines from information extracted by cell phones. The framework is built from a probabilistic topic model learned on novel bag type representations of activity-related cues (location, proximity and their temporal variations over a day) of people’s daily routines. Using real-life data from the Reality Mining dataset, covering 68 000+ hours of human activities, we can successfully discover location-driven (from cell tower connections) and proximity-driven (from Bluetooth information) routines in an unsupervised manner. The resulting topics meaningfully characterize some of the underlying co-occurrence structure of the activities in the dataset, including “going to work early/late”, “being home all day”, “working constantly”, “working sporadically” and “meeting at lunch time”.},
  author = {Farrahi, Katayoun and Gatica-Perez, Daniel},
  interhash = {5e3f9c64f6fb9ba5226e3345acd3ddd8},
  intrahash = {4c905f2cfc5e88c271ebc4f10d47de30},
  title = {Discovering Human Routines from Cell Phone Data with Topic Models},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.139.5105},
  year = 2010
}

@inproceedings{1622221,
  author    = {Teevan, Jaime and Dumais, Susan T. and Liebling, Daniel J. and Hughes, Richard L.},
  title     = {Changing how people view changes on the web},
  booktitle = {UIST '09: Proceedings of the 22nd annual ACM symposium on User interface software and technology},
  pages     = {237--246},
  year      = 2009,
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Victoria, BC, Canada},
  isbn      = {978-1-60558-745-5},
  doi       = {10.1145/1622176.1622221},
  url       = {http://portal.acm.org/citation.cfm?id=1622176.1622221},
  abstract  = {The Web is a dynamic information environment. Web content changes regularly and people revisit Web pages frequently. But the tools used to access the Web, including browsers and search engines, do little to explicitly support these dynamics. In this paper we present DiffIE, a browser plug-in that makes content change explicit in a simple and lightweight manner. DiffIE caches the pages a person visits and highlights how those pages have changed when the person returns to them. We describe how we built a stable, reliable, and usable system, including how we created compact, privacy-preserving page representations to support fast difference detection. Via a longitudinal user study, we explore how DiffIE changed the way people dealt with changing content. We find that much of its benefit came not from exposing expected change, but rather from drawing attention to unexpected change and helping people build a richer understanding of the Web content they frequent.},
  interhash = {507f0548966b5f8f50963822c8279211},
  intrahash = {c4cb8b6a8573f285ec4d907dec856a18}
}

@book{Gentle:2007,
  abstract = {Bibliogr. S. [505] - 518},
  address = {New York},
  author = {Gentle, James E.},
  interhash = {2156bd85da160d6baf88b187fd1e6230},
  intrahash = {fccc8b26fcc1912304600c6410f241e5},
  isbn = {978-0-387-70872-0},
  opac = {http://opac.bibliothek.uni-kassel.de/DB=1/PPN?PPN=190806516},
  publisher = {Springer},
  title = {Matrix Algebra},
  url = {http://opac.bibliothek.uni-kassel.de/DB=1/PPN?PPN=190806516},
  year = 2007
}

@article{journals/www/EdaYUU09,
  author = {Eda, Takeharu and Yoshikawa, Masatoshi and Uchiyama, Toshio and Uchiyama, Tadasu},
  doi = {10.1007/s11280-009-0069-1},
  ee = {http://dx.doi.org/10.1007/s11280-009-0069-1},
  interhash = {a560796c977bc7582017f662bf88c16d},
  intrahash = {ec3c256e7d1f24cd9d407d3ce7e41d96},
  journal = {World Wide Web},
  number = 4,
  pages = {421--440},
  title = {The Effectiveness of Latent Semantic Analysis for Building Up a Bottom-up Taxonomy from Folksonomy Tags},
  url = {http://dblp.uni-trier.de/db/journals/www/www12.html#EdaYUU09},
  volume = 12,
  year = 2009
}

@article{1117458,
  author    = {O'Madadhain, Joshua and Hutchins, Jon and Smyth, Padhraic},
  title     = {Prediction and ranking algorithms for event-based network data},
  journal   = {SIGKDD Explor. Newsl.},
  volume    = 7,
  number    = 2,
  pages     = {23--30},
  year      = 2005,
  publisher = {ACM},
  address   = {New York, NY, USA},
  issn      = {1931-0145},
  doi       = {10.1145/1117454.1117458},
  url       = {http://portal.acm.org/citation.cfm?id=1117458},
  abstract  = {Event-based network data consists of sets of events over time, each of which may involve multiple entities. Examples include email traffic, telephone calls, and research publications (interpreted as co-authorship events). Traditional network analysis techniques, such as social network models, often aggregate the relational information from each event into a single static network. In contrast, in this paper we focus on the temporal nature of such data. In particular, we look at the problems of temporal link prediction and node ranking, and describe new methods that illustrate opportunities for data mining and machine learning techniques in this context. Experimental results are discussed for a large set of co-authorship events measured over multiple years, and a large corporate email data set spanning 21 months.},
  interhash = {97a718ab9fe24625f7389939d2608d31},
  intrahash = {89a23b31a476c4f3f771b5e3e4a8432c}
}

@misc{Sonnenbichler2010,
  abstract = {Web 2.0 is transforming the internet: Information consumers become
information producers and consumers at the same time. In virtual places like
Facebook, Youtube, discussion boards and weblogs diversificated topics, groups
and issues are propagated and discussed. Today an internet user is a member of
lots of communities at different virtual places. ``Real life'' group membership
and group behavior has been analyzed in science intensively in the last
decades. Most interestingly, to our knowledge, user roles and behavior have not
been adapted to the modern internet. In this work, we give a short overview of
traditional community roles. We adapt those models and apply them to virtual
online communities. We suggest a community membership life cycle model
describing roles a user can take during his membership in a community. Our
model is systematic and generic; it can be adapted to concrete communities in
the web. The knowledge of a community's life cycle allows influencing the group
structure: Stage transitions can be supported or harmed, e.g. to strengthen the
binding of a user to a site and keep communities alive.},
  archiveprefix = {arXiv},
  author = {Sonnenbichler, Andreas C.},
  eprint = {1006.4271},
  interhash = {c244c29b978e6aa0b032a83597bd9744},
  intrahash = {6b808f2988784442ca0f0ab2560f851b},
  note = {Presented at the International Network For Social Network Analysis (INSNA): Sunbelt Conference 2009, San Diego, California, USA. 9 pages, 6 figures},
  title = {A Community Membership Life Cycle Model},
  url = {http://arxiv.org/abs/1006.4271},
  year = 2010
}

@misc{Karypis02multilevelhypergraph,
  author    = {Karypis, George},
  title     = {Multilevel Hypergraph Partitioning},
  year      = 2002,
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.9117},
  abstract  = {Introduction  Hypergraph partitioning is an important problem with extensive application to many areas, including VLSI design [Alpert and Kahng, 1995], efficient storage of large databases on disks [Shekhar and Liu, 1996], and data mining [Mobasher et al., 1996, Karypis et al., 1999b]. The problem is to partition the vertices of a hypergraph into k equal-size parts, such that the number of hyperedges connecting vertices in different parts is minimized.  During the course of VLSI circuit design and synthesis, it is important to be able to divide the system specification into clusters so that the inter-cluster connections are minimized. This step has many applications including design packaging, HDL-based synthesis, design optimization, rapid prototyping, simulation, and testing. Many rapid prototyping systems use partitioning to map a complex circuit onto hundreds of interconnected FPGAs. Such partitioning instances are challenging because the timing, area, and I/O resource utilization },
  interhash = {c79f1aad4b40640a346bd67fdd4eada3},
  intrahash = {e1d8b31de59731bbf41a8559c8cf9caa}
}

@article{journals/kais/GuoV08,
  author = {Guo, Hongyu and Viktor, Herna L.},
  doi = {10.1007/s10115-008-0127-5},
  ee = {http://dx.doi.org/10.1007/s10115-008-0127-5},
  interhash = {430e66171506726f2478939162b789b6},
  intrahash = {35bee198e0b131ead007dd91d794ab78},
  journal = {Knowl. Inf. Syst.},
  number = 3,
  pages = {287--312},
  timestamp = {2010-02-02},
  title = {Multirelational classification: a multiple view approach},
  url = {http://dblp.uni-trier.de/db/journals/kais/kais17.html#GuoV08},
  volume = 17,
  year = 2008
}