% Bibliography database (BibSonomy-style export), one field per line.
%
% Conventions used in this file:
%   * doi holds the bare DOI, without a resolver prefix.
%   * pages uses a double hyphen for ranges and carries no "pp." prefix.
%   * month uses the unquoted three-letter BibTeX macros.
%   * Accented letters in printed fields are written as BibTeX special
%     characters so that classic BibTeX sorts and labels them correctly.
%   * Words in titles that must keep their capitalisation are braced.
%   * acmid, interhash, intrahash, issue_date, numpages, articleno, language,
%     copyright and the jstor fields are kept as exported; standard styles
%     ignore fields they do not know.
%   * Abstracts are kept as exported (including their Unicode punctuation);
%     only literal percent signs are escaped so that a style which prints
%     abstracts does not treat the rest of the line as a TeX comment.

@article{duartetorres2014analysis,
  author     = {Duarte Torres, Sergio and Weber, Ingmar and Hiemstra, Djoerd},
  title      = {Analysis of Search and Browsing Behavior of Young Users on the {Web}},
  journal    = {ACM Transactions on the Web},
  volume     = {8},
  number     = {2},
  pages      = {7:1--7:54},
  articleno  = {7},
  numpages   = {54},
  month      = mar,
  year       = {2014},
  issue_date = {March 2014},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {1559-1131},
  doi        = {10.1145/2555595},
  url        = {http://doi.acm.org/10.1145/2555595},
  acmid      = {2555595},
  interhash  = {d424ae624f1065d6616609d7356c4b21},
  intrahash  = {c3377980acb82b95a518eb7806f9c592},
  abstract   = {The Internet is increasingly used by young children for all kinds of purposes. Nonetheless, there are not many resources especially designed for children on the Internet and most of the content online is designed for grown-up users. This situation is problematic if we consider the large differences between young users and adults since their topic interests, computer skills, and language capabilities evolve rapidly during childhood. There is little research aimed at exploring and measuring the difficulties that children encounter on the Internet when searching for information and browsing for content. In the first part of this work, we employed query logs from a commercial search engine to quantify the difficulties children of different ages encounter on the Internet and to characterize the topics that they search for. We employed query metrics (e.g., the fraction of queries posed in natural language), session metrics (e.g., the fraction of abandoned sessions), and click activity (e.g., the fraction of ad clicks). The search logs were also used to retrace stages of child development. Concretely, we looked for changes in interests (e.g., the distribution of topics searched) and language development (e.g., the readability of the content accessed and the vocabulary size). In the second part of this work, we employed toolbar logs from a commercial search engine to characterize the browsing behavior of young users, particularly to understand the activities on the Internet that trigger search. We quantified the proportion of browsing and search activity in the toolbar sessions and we estimated the likelihood of a user to carry out search on the Web vertical and multimedia verticals (i.e., videos and images) given that the previous event is another search event or a browsing event. We observed that these metrics clearly demonstrate an increased level of confusion and unsuccessful search sessions among children. We also found a clear relation between the reading level of the clicked pages and characteristics of the users such as age and educational attainment. In terms of browsing behavior, children were found to start their activities on the Internet with a search engine (instead of directly browsing content) more often than adults. We also observed a significantly larger amount of browsing activity for the case of teenager users. Interestingly we also found that if children visit knowledge-related Web sites (i.e., information-dense pages such as Wikipedia articles), they subsequently do more Web searches than adults. Additionally, children and especially teenagers were found to have a greater tendency to engage in multimedia search, which calls to improve the aggregation of multimedia results into the current search result pages.},
}

@article{thomas2014using,
  author     = {Thomas, Paul},
  title      = {Using Interaction Data to Explain Difficulty Navigating Online},
  journal    = {ACM Transactions on the Web},
  volume     = {8},
  number     = {4},
  pages      = {24:1--24:41},
  articleno  = {24},
  numpages   = {41},
  month      = nov,
  year       = {2014},
  issue_date = {October 2014},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {1559-1131},
  doi        = {10.1145/2656343},
  url        = {http://doi.acm.org/10.1145/2656343},
  acmid      = {2656343},
  interhash  = {b570b16074de8ee1b2db9fcf1061d16b},
  intrahash  = {06035bc4bd2c62d7dba957ad4410d7b3},
  abstract   = {A user's behaviour when browsing a Web site contains clues to that user's experience. It is possible to record some of these behaviours automatically, and extract signals that indicate a user is having trouble finding information. This allows for Web site analytics based on user experiences, not just page impressions. A series of experiments identified user browsing behaviours—such as time taken and amount of scrolling up a page—which predict navigation difficulty and which can be recorded with minimal or no changes to existing sites or browsers. In turn, patterns of page views correlate with these signals and these patterns can help Web authors understand where and why their sites are hard to navigate. A new software tool, “LATTE,” automates this analysis and makes it available to Web authors in the context of the site itself.},
}

@article{newman2004finding,
  author    = {Newman, M. E. J. and Girvan, M.},
  title     = {Finding and evaluating community structure in networks},
  journal   = {Physical Review E},
  volume    = {69},
  number    = {2},
  pages     = {026113},
  numpages  = {15},
  month     = feb,
  year      = {2004},
  publisher = {American Physical Society},
  doi       = {10.1103/PhysRevE.69.026113},
  url       = {http://link.aps.org/doi/10.1103/PhysRevE.69.026113},
  interhash = {b9145040e35ccb4d2a0ce18105e64ff4},
  intrahash = {1dbc30a1818aa74973f387162e485443},
}

@article{fortunato2010community,
  author    = {Fortunato, Santo},
  title     = {Community detection in graphs},
  journal   = {Physics Reports},
  volume    = {486},
  number    = {3--5},
  pages     = {75--174},
  year      = {2010},
  issn      = {0370-1573},
  doi       = {10.1016/j.physrep.2009.11.002},
  url       = {http://www.sciencedirect.com/science/article/pii/S0370157309002841},
  interhash = {9f6089e942903fc65309f77744c88109},
  intrahash = {fddddfb8990e8ea824c8c4b62244f737},
  abstract  = {The modern science of networks has brought significant advances to our understanding of complex systems. One of the most relevant features of graphs representing real systems is community structure, or clustering, i.e. the organization of vertices in clusters, with many edges joining vertices of the same cluster and comparatively few edges joining vertices of different clusters. Such clusters, or communities, can be considered as fairly independent compartments of a graph, playing a similar role like, e.g., the tissues or the organs in the human body. Detecting communities is of great importance in sociology, biology and computer science, disciplines where systems are often represented as graphs. This problem is very hard and not yet satisfactorily solved, despite the huge effort of a large interdisciplinary community of scientists working on it over the past few years. We will attempt a thorough exposition of the topic, from the definition of the main elements of the problem, to the presentation of most methods developed, with a special focus on techniques designed by statistical physicists, from the discussion of crucial issues like the significance of clustering and how methods should be tested and compared against each other, to the description of applications to real networks.},
}

% atzmueller2015descriptionoriented: the export carried placeholder values
% (number 0 and an empty page range) and no volume. The placeholders are
% omitted here; fill in volume and pages once the final data are known.
@article{atzmueller2015descriptionoriented,
  author    = {Atzmueller, Martin and Doerfel, Stephan and Mitzlaff, Folke},
  title     = {Description-oriented community detection using exhaustive subgroup discovery},
  journal   = {Information Sciences},
  year      = {2015},
  issn      = {0020-0255},
  doi       = {10.1016/j.ins.2015.05.008},
  url       = {http://www.sciencedirect.com/science/article/pii/S0020025515003667},
  interhash = {d87cc381289cd86387b81ff5b8646cb5},
  intrahash = {fb7a824e273ab34db22f49d54b5d1e12},
  abstract  = {Communities can intuitively be defined as subsets of nodes of a graph with a dense structure in the corresponding subgraph. However, for mining such communities usually only structural aspects are taken into account. Typically, no concise nor easily interpretable community description is provided. For tackling this issue, this paper focuses on description-oriented community detection using subgroup discovery. In order to provide both structurally valid and interpretable communities we utilize the graph structure as well as additional descriptive features of the graph’s nodes. A descriptive community pattern built upon these features then describes and identifies a community, i.e., a set of nodes, and vice versa. Essentially, we mine patterns in the “description space” characterizing interesting sets of nodes (i.e., subgroups) in the “graph space”; the interestingness of a community is evaluated by a selectable quality measure. We aim at identifying communities according to standard community quality measures, while providing characteristic descriptions of these communities at the same time. For this task, we propose several optimistic estimates of standard community quality functions to be used for efficient pruning of the search space in an exhaustive branch-and-bound algorithm. We demonstrate our approach in an evaluation using five real-world data sets, obtained from three different social media applications.},
}

@article{agrawal1993mining,
  author     = {Agrawal, Rakesh and Imieli{\'n}ski, Tomasz and Swami, Arun},
  title      = {Mining Association Rules Between Sets of Items in Large Databases},
  journal    = {SIGMOD Record},
  volume     = {22},
  number     = {2},
  pages      = {207--216},
  numpages   = {10},
  month      = jun,
  year       = {1993},
  issue_date = {June 1, 1993},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0163-5808},
  doi        = {10.1145/170036.170072},
  url        = {http://doi.acm.org/10.1145/170036.170072},
  acmid      = {170072},
  interhash  = {53341ce3e6ce51c3bcf8b0219ec239b5},
  intrahash  = {8730417fb8d6eff31b43254b67d09f83},
}

@article{pasquier1999efficient,
  author    = {Pasquier, Nicolas and Bastide, Yves and Taouil, Rafik and Lakhal, Lotfi},
  title     = {Efficient mining of association rules using closed itemset lattices},
  journal   = {Information Systems},
  volume    = {24},
  number    = {1},
  pages     = {25--46},
  year      = {1999},
  issn      = {0306-4379},
  doi       = {10.1016/S0306-4379(99)00003-4},
  url       = {http://www.sciencedirect.com/science/article/pii/S0306437999000034},
  interhash = {14f55460561c18e8d47a1ffaad6bb738},
  intrahash = {dd4cac14856b487b0819ebe042301d56},
  abstract  = {Discovering association rules is one of the most important task in data mining. Many efficient algorithms have been proposed in the literature. The most noticeable are Apriori, Mannila's algorithm, Partition, Sampling and DIC, that are all based on the Apriori mining method: pruning the subset lattice (itemset lattice). In this paper we propose an efficient algorithm, called Close, based on a new mining method: pruning the closed set lattice (closed itemset lattice). This lattice, which is a sub-order of the subset lattice, is closely related to Wille's concept lattice in formal concept analysis. Experiments comparing Close to an optimized version of Apriori showed that Close is very efficient for mining dense and/or correlated data such as census style data, and performs reasonably well for market basket style data.},
}

% fu2014academic: the abstract is truncated in the export (it stops after
% "we define metrics such as"); it is kept as exported.
@article{fu2014academic,
  author    = {Fu, Tom Z. J. and Song, Qianqian and Chiu, Dah Ming},
  title     = {The academic social network},
  journal   = {Scientometrics},
  volume    = {101},
  number    = {1},
  pages     = {203--239},
  year      = {2014},
  publisher = {Springer Netherlands},
  issn      = {0138-9130},
  doi       = {10.1007/s11192-014-1356-x},
  url       = {http://dx.doi.org/10.1007/s11192-014-1356-x},
  language  = {English},
  interhash = {a39d784173e693ac65979737e96c2a3c},
  intrahash = {de2f3434421912af52e355578e147b0a},
  abstract  = {By means of their academic publications, authors form a social network. Instead of sharing casual thoughts and photos (as in Facebook), authors select co-authors and reference papers written by other authors. Thanks to various efforts (such as Microsoft Academic Search and DBLP), the data necessary for analyzing the academic social network is becoming more available on the Internet. What type of information and queries would be useful for users to discover, beyond the search queries already available from services such as Google Scholar? In this paper, we explore this question by defining a variety of ranking metrics on different entities—authors, publication venues, and institutions. We go beyond traditional metrics such as paper counts, citations, and h-index. Specifically, we define metrics such as },
}

@article{masbleda2014highly,
  author    = {Mas-Bleda, Amalia and Thelwall, Mike and Kousha, Kayvan and Aguillo, Isidro F.},
  title     = {Do highly cited researchers successfully use the social web?},
  journal   = {Scientometrics},
  volume    = {101},
  number    = {1},
  pages     = {337--356},
  year      = {2014},
  publisher = {Springer Netherlands},
  issn      = {0138-9130},
  doi       = {10.1007/s11192-014-1345-0},
  url       = {http://dx.doi.org/10.1007/s11192-014-1345-0},
  language  = {English},
  interhash = {5110401b47f90128cbe885cf441ab7fb},
  intrahash = {9fa40f587b142513785037b67040abe4},
  abstract  = {Academics can now use the web and the social websites to disseminate scholarly information in a variety of different ways. Although some scholars have taken advantage of these new online opportunities, it is not clear how widespread their uptake is or how much impact they can have. This study assesses the extent to which successful scientists have social web presences, focusing on one influential group: highly cited researchers working at European institutions. It also assesses the impact of these presences. We manually and systematically identified if the European highly cited researchers had profiles in Google Scholar, Microsoft Academic Search, Mendeley, Academia and LinkedIn or any content in SlideShare. We then used URL mentions and altmetric indicators to assess the impact of the web presences found. Although most of the scientists had an institutional website of some kind, few had created a profile in any social website investigated, and LinkedIn—the only non-academic site in the list—was the most popular. Scientists having one kind of social web profile were more likely to have another in many cases, especially in the life sciences and engineering. In most cases it was possible to estimate the relative impact of the profiles using a readily available statistic and there were disciplinary differences in the impact of the different kinds of profiles. Most social web profiles had some evidence of uptake, if not impact; nevertheless, the value of the indicators used is unclear.},
}

% petersen2008qualitative: exported with the non-standard entry type
% "presentation"; misc is the standard type that supports howpublished.
@misc{petersen2008qualitative,
  author       = {Petersen, Wiebke and Heinrich, Petja},
  title        = {Qualitative Citation Analysis Based on Formal Concept Analysis},
  howpublished = {Presented at the 32nd annual meeting of the Classification Society in Hamburg},
  month        = jul,
  year         = {2008},
  url          = {http://user.phil-fak.uni-duesseldorf.de/~petersen/slides/Petersen_Heinrich_GFKL2008_slides.pdf},
  interhash    = {cb0721cb3aa84ba0898e3afc784559c8},
  intrahash    = {cd5002cf2f214097cb7ff855b4af5db3},
}

@article{brzezinski2015power,
  author    = {Brzezinski, Michal},
  title     = {Power laws in citation distributions: evidence from {Scopus}},
  journal   = {Scientometrics},
  volume    = {103},
  number    = {1},
  pages     = {213--228},
  year      = {2015},
  publisher = {Springer Netherlands},
  issn      = {0138-9130},
  doi       = {10.1007/s11192-014-1524-z},
  url       = {http://dx.doi.org/10.1007/s11192-014-1524-z},
  language  = {English},
  interhash = {b162eddb3ff76a9eef5daf450da934c0},
  intrahash = {8ef9a6fbfcca3d599ca500cf4f9a2e39},
  abstract  = {Modeling distributions of citations to scientific papers is crucial for understanding how science develops. However, there is a considerable empirical controversy on which statistical model fits the citation distributions best. This paper is concerned with rigorous empirical detection of power-law behaviour in the distribution of citations received by the most highly cited scientific papers. We have used a large, novel data set on citations to scientific papers published between 1998 and 2002 drawn from Scopus. The power-law model is compared with a number of alternative models using a likelihood ratio test. We have found that the power-law hypothesis is rejected for around half of the Scopus fields of science. For these fields of science, the Yule, power-law with exponential cut-off and log-normal distributions seem to fit the data better than the pure power-law model. On the other hand, when the power-law hypothesis is not rejected, it is usually empirically indistinguishable from most of the alternative models. The pure power-law model seems to be the best model only for the most highly cited papers in “Physics and Astronomy”. Overall, our results seem to support theories implying that the most highly cited scientific papers follow the Yule, power-law with exponential cut-off or log-normal distribution. Our findings suggest also that power laws in citation distributions, when present, account only for a very small fraction of the published papers (less than 1 \% for most of science fields) and that the power-law scaling parameter (exponent) is substantially higher (from around 3.2 to around 4.7) than found in the older literature.},
}

% clauset2009powerlaw: the export had a broken url value and an eprint field
% that merely repeated the DOI resolver URL; the url is repaired and the
% redundant eprint value dropped (the doi field carries the same identifier).
@article{clauset2009powerlaw,
  author    = {Clauset, Aaron and Shalizi, Cosma Rohilla and Newman, M. E. J.},
  title     = {Power-Law Distributions in Empirical Data},
  journal   = {SIAM Review},
  volume    = {51},
  number    = {4},
  pages     = {661--703},
  year      = {2009},
  doi       = {10.1137/070710111},
  url       = {http://dx.doi.org/10.1137/070710111},
  interhash = {9ce8658af5a6358a758bfdb819f73394},
  intrahash = {c0097d202655474b1db6811ddea03410},
}

@article{vuong1989likelihood,
  author              = {Vuong, Quang H.},
  title               = {Likelihood Ratio Tests for Model Selection and Non-Nested Hypotheses},
  journal             = {Econometrica},
  volume              = {57},
  number              = {2},
  pages               = {307--333},
  month               = mar,
  year                = {1989},
  publisher           = {The Econometric Society},
  issn                = {0012-9682},
  url                 = {http://www.jstor.org/stable/1912557},
  language            = {English},
  copyright           = {Copyright © 1989 The Econometric Society},
  jstor_articletype   = {research-article},
  jstor_formatteddate = {Mar., 1989},
  interhash           = {e00a4353cb1b1241e5d3c52f531be8bd},
  intrahash           = {6888912f6666d4de22bdc794a05dfa1b},
  abstract            = {In this paper, we develop a classical approach to model selection. Using the Kullback-Leibler Information Criterion to measure the closeness of a model to the truth, we propose simple likelihood-ratio based statistics for testing the null hypothesis that the competing models are equally close to the true data generating process against the alternative hypothesis that one model is closer. The tests are directional and are derived successively for the cases where the competing models are non-nested, overlapping, or nested and whether both, one, or neither is misspecified. As a prerequisite, we fully characterize the asymptotic distribution of the likelihood ratio statistic under the most general conditions. We show that it is a weighted sum of chi-square distribution or a normal distribution depending on whether the distributions in the competing models closest to the truth are observationally identical. We also propose a test of this latter condition.},
}

@article{albarrn2011references,
  author    = {Albarr{\'a}n, Pedro and Ruiz-Castillo, Javier},
  title     = {References made and citations received by scientific articles},
  journal   = {Journal of the American Society for Information Science and Technology},
  volume    = {62},
  number    = {1},
  pages     = {40--49},
  year      = {2011},
  publisher = {Wiley Subscription Services, Inc., A Wiley Company},
  issn      = {1532-2890},
  doi       = {10.1002/asi.21448},
  url       = {http://dx.doi.org/10.1002/asi.21448},
  interhash = {79502663727fcbd4834a423f4e3212a3},
  intrahash = {f20e50e960696bab3b39b628718dd850},
  abstract  = {This article studies massive evidence about references made and citations received after a 5-year citation window by 3.7 million articles published in 1998 to 2002 in 22 scientific fields. We find that the distributions of references made and citations received share a number of basic features across sciences. Reference distributions are rather skewed to the right while citation distributions are even more highly skewed: The mean is about 20 percentage points to the right of the median, and articles with a remarkable or an outstanding number of citations represent about 9\% of the total. Moreover, the existence of a power law representing the upper tail of citation distributions cannot be rejected in 17 fields whose articles represent 74.7\% of the total. Contrary to the evidence in other contexts, the value of the scale parameter is above 3.5 in 13 of the 17 cases. Finally, power laws are typically small, but capture a considerable proportion of the total citations received.},
}

% cerinek2015network: the abstract is truncated in the export (it stops
% after "2-mode networks (works"); it is kept as exported.
@article{cerinek2015network,
  author    = {Cerin{\v{s}}ek, Monika and Batagelj, Vladimir},
  title     = {Network analysis of {Zentralblatt MATH} data},
  journal   = {Scientometrics},
  volume    = {102},
  number    = {1},
  pages     = {977--1001},
  year      = {2015},
  publisher = {Springer Netherlands},
  issn      = {0138-9130},
  doi       = {10.1007/s11192-014-1419-z},
  url       = {http://dx.doi.org/10.1007/s11192-014-1419-z},
  language  = {English},
  interhash = {e65f748684210857bb19dc7f69d65f86},
  intrahash = {bcba93fd0e6381289c489cbab20bbec7},
  abstract  = {We analyze the data about works (papers, books) from the time period 1990–2010 that are collected in Zentralblatt MATH database. The data were converted into four 2-mode networks (works },
}

% alstott2013powerlaw: the arXiv identifier that the export placed in the
% note field is stored in eprint/archiveprefix instead.
@misc{alstott2013powerlaw,
  author        = {Alstott, Jeff and Bullmore, Ed and Plenz, Dietmar},
  title         = {Powerlaw: a {Python} package for analysis of heavy-tailed distributions},
  year          = {2013},
  eprint        = {1305.0215},
  archiveprefix = {arXiv},
  doi           = {10.1371/journal.pone.0085777},
  url           = {http://arxiv.org/abs/1305.0215},
  note          = {18 pages, 6 figures, code and supporting information at https://github.com/jeffalstott/powerlaw and https://pypi.python.org/pypi/powerlaw},
  interhash     = {3e00fb5f61ea9069884122a61ca60c1f},
  intrahash     = {5c2f8406c2fca10773f28e538fbc115d},
  abstract      = {Power laws are theoretically interesting probability distributions that are also frequently used to describe empirical data. In recent years effective statistical methods for fitting power laws have been developed, but appropriate use of these techniques requires significant programming and statistical insight. In order to greatly decrease the barriers to using good statistical methods for fitting power law distributions, we developed the powerlaw Python package. This software package provides easy commands for basic fitting and statistical analysis of distributions. Notably, it also seeks to support a variety of user needs by being exhaustive in the options available to the user. The source code is publicly available and easily extensible.},
}

@article{duquenne1986famille,
  author    = {Guigues, J. L. and Duquenne, Vincent},
  title     = {Famille minimale d'implications informatives r{\'e}\-sultant d'un tableau de donn{\'e}es binaires},
  journal   = {Math{\'e}matiques et Sciences Humaines},
  volume    = {24},
  number    = {95},
  pages     = {5--18},
  year      = {1986},
  interhash = {ed5aac1ce34fbb024fe280628d3634fe},
  intrahash = {863a47ad7e52e4ec62c7445b10fad4f9},
}

@incollection{ganter2007relational,
  author    = {Ganter, Bernhard},
  editor    = {Kuznetsov, Sergei O. and Schmidt, Stefan},
  title     = {Relational {Galois} Connections},
  booktitle = {Formal Concept Analysis},
  series    = {Lecture Notes in Computer Science},
  volume    = {4390},
  pages     = {1--17},
  year      = {2007},
  publisher = {Springer Berlin Heidelberg},
  isbn      = {978-3-540-70828-5},
  doi       = {10.1007/978-3-540-70901-5_1},
  url       = {http://dx.doi.org/10.1007/978-3-540-70901-5_1},
  language  = {English},
  interhash = {03a1b0464331a319f3ef868bc280aa67},
  intrahash = {5d4d028f5cc93398e2acb5d3cb939bfc},
  abstract  = {Galois connections can be defined for lattices and for ordered sets. We discuss a rather wide generalisation, which was introduced by Weiqun Xia and has been reinvented under different names: Relational Galois connections between relations. It turns out that the generalised notion is of importance for the original one and can be utilised, e.g., for computing Galois connections.},
}

@article{glushko2008categorization,
  author    = {Glushko, Robert J. and Maglio, Paul P. and Matlock, Teenie and Barsalou, Lawrence W.},
  title     = {Categorization in the wild},
  journal   = {Trends in Cognitive Sciences},
  volume    = {12},
  number    = {4},
  pages     = {129--135},
  year      = {2008},
  issn      = {1364-6613},
  doi       = {10.1016/j.tics.2008.01.007},
  url       = {http://www.sciencedirect.com/science/article/pii/S1364661308000557},
  interhash = {f21366054d9e9524e0f90a17e59aca97},
  intrahash = {09d23ef22bc50a6c21735e5332f1ecdb},
  abstract  = {In studying categorization, cognitive science has focused primarily on cultural categorization, ignoring individual and institutional categorization. Because recent technological developments have made individual and institutional classification systems much more available and powerful, our understanding of the cognitive and social mechanisms that produce these systems is increasingly important. Furthermore, key aspects of categorization that have received little previous attention emerge from considering diverse types of categorization together, such as the social factors that create stability in classification systems, and the interoperability that shared conceptual systems establish between agents. Finally, the profound impact of recent technological developments on classification systems indicates that basic categorization mechanisms are highly adaptive, producing new classification systems as the situations in which they operate change.},
}

@inproceedings{ames2007motivations,
  author    = {Ames, Morgan and Naaman, Mor},
  title     = {Why We Tag: Motivations for Annotation in Mobile and Online Media},
  booktitle = {Proceedings of the {SIGCHI} Conference on Human Factors in Computing Systems},
  series    = {CHI '07},
  pages     = {971--980},
  numpages  = {10},
  year      = {2007},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {San Jose, California, USA},
  isbn      = {978-1-59593-593-9},
  doi       = {10.1145/1240624.1240772},
  url       = {http://doi.acm.org/10.1145/1240624.1240772},
  acmid     = {1240772},
  interhash = {bd24c17d66d2b904b3fc9444c2b64b44},
  intrahash = {c3840b12cf9592a782a09ab9e1bdf49e},
  abstract  = {Why do people tag? Users have mostly avoided annotating media such as photos -- both in desktop and mobile environments -- despite the many potential uses for annotations, including recall and retrieval. We investigate the incentives for annotation in Flickr, a popular web-based photo-sharing system, and ZoneTag, a cameraphone photo capture and annotation tool that uploads images to Flickr. In Flickr, annotation (as textual tags) serves both personal and social purposes, increasing incentives for tagging and resulting in a relatively high number of annotations. ZoneTag, in turn, makes it easier to tag cameraphone photos that are uploaded to Flickr by allowing annotation and suggesting relevant tags immediately after capture. A qualitative study of ZoneTag/Flickr users exposed various tagging patterns and emerging motivations for photo annotation. We offer a taxonomy of motivations for annotation in this system along two dimensions (sociality and function), and explore the various factors that people consider when tagging their photos. Our findings suggest implications for the design of digital photo organization and sharing applications, as well as other applications that incorporate user-based annotation.},
}