@inproceedings{he2022design, abstract = {Continual learning can enable neural networks to evolve by learning new tasks sequentially in task-changing scenarios. However, two general challenges should be overcome in further research before we apply this technique to real-world applications. Firstly, newly collected novelties from the data stream in applications could contain anomalies that are meaningless for continual learning. Instead of viewing them as a new task for updating, we have to filter out such anomalies to reduce the disturbance of extremely high-entropy data on the progression of convergence. Secondly, little effort has been put into research regarding the explainability of continual learning, which leads to a lack of transparency and credibility of the updated neural networks. Elaborated explanations about the process and results of continual learning can help experts in judging and making decisions. Therefore, we propose the conceptual design of an explainability module with experts in the loop based on techniques such as dimension reduction, visualization, and evaluation strategies. This work aims to overcome the mentioned challenges by sufficiently explaining and visualizing the identified anomalies and the updated neural network. With the help of this module, experts can be more confident in decision-making regarding anomaly filtering, dynamic adjustment of hyperparameters, data backup, etc.}, author = {He, Yujiang and Huang, Zhixin and Sick, Bernhard}, booktitle = {Interactive Machine Learning Workshop (IMLW), AAAI}, interhash = {e1229de8e285fd3b266fd73c3f5287c1}, intrahash = {040c3c0e2cae3bcf668f6c6a67aed6be}, pages = {1--6}, title = {Design of Explainability Module with Experts in the Loop for Visualization and Dynamic Adjustment of Continual Learning}, url = {https://arxiv.org/abs/2202.06781}, year = 2022 } @article{adomavicius2012impact, abstract = {This article investigates the impact of rating data characteristics on the performance of several popular recommendation algorithms, including user-based and item-based collaborative filtering, as well as matrix factorization. We focus on three groups of data characteristics: rating space, rating frequency distribution, and rating value distribution. A sampling procedure was employed to obtain different rating data subsamples with varying characteristics; recommendation algorithms were used to estimate the predictive accuracy for each sample; and linear regression-based models were used to uncover the relationships between data characteristics and recommendation accuracy. Experimental results on multiple rating datasets show the consistent and significant effects of several data characteristics on recommendation accuracy.}, acmid = {2151166}, address = {New York, NY, USA}, articleno = {3}, author = {Adomavicius, Gediminas and Zhang, Jingjing}, doi = {10.1145/2151163.2151166}, interhash = {53e424cc9502ebb33d38de1d04230196}, intrahash = {e41453a56391ca382f2298607b361208}, issn = {2158-656X}, issue_date = {April 2012}, journal = {ACM Transactions on Management Information Systems}, month = apr, number = 1, numpages = {17}, pages = {3:1--3:17}, publisher = {ACM}, title = {Impact of Data Characteristics on Recommender Systems Performance}, url = {http://doi.acm.org/10.1145/2151163.2151166}, volume = 3, year = 2012 } @inproceedings{cremonesi2010performance, abstract = {In many commercial systems, the 'best bet' recommendations are shown, but the predicted rating values are not.
This is usually referred to as a top-N recommendation task, where the goal of the recommender system is to find a few specific items which are supposed to be most appealing to the user. Common methodologies based on error metrics (such as RMSE) are not a natural fit for evaluating the top-N recommendation task. Rather, top-N performance can be directly measured by alternative methodologies based on accuracy metrics (such as precision/recall). An extensive evaluation of several state-of-the-art recommender algorithms suggests that algorithms optimized for minimizing RMSE do not necessarily perform as expected in terms of the top-N recommendation task. Results show that improvements in RMSE often do not translate into accuracy improvements. In particular, a naive non-personalized algorithm can outperform some common recommendation approaches and almost match the accuracy of sophisticated algorithms. Another finding is that the very few top popular items can skew the top-N performance. The analysis points out that when evaluating a recommender algorithm on the top-N recommendation task, the test set should be chosen carefully in order to not bias accuracy metrics towards non-personalized solutions. Finally, we offer practitioners new variants of two collaborative filtering algorithms that, regardless of their RMSE, significantly outperform other recommender algorithms in pursuing the top-N recommendation task, while offering additional practical advantages. This comes as a surprise given the simplicity of these two methods.}, acmid = {1864721}, address = {New York, NY, USA}, author = {Cremonesi, Paolo and Koren, Yehuda and Turrin, Roberto}, booktitle = {Proceedings of the Fourth ACM Conference on Recommender Systems}, doi = {10.1145/1864708.1864721}, interhash = {04cb3373b65b03e03225f447250e7873}, intrahash = {aeab7f02942cfeb97ccc7ae0a1d60801}, isbn = {978-1-60558-906-0}, location = {Barcelona, Spain}, numpages = {8}, pages = {39--46}, publisher = {ACM}, series = {RecSys '10}, title = {Performance of Recommender Algorithms on Top-N Recommendation Tasks}, url = {http://doi.acm.org/10.1145/1864708.1864721}, year = 2010 } @inproceedings{korner2010categorizers, abstract = {While recent research has advanced our understanding about the structure and dynamics of social tagging systems, we know little about (i) the underlying motivations for tagging (why users tag), and (ii) how they influence the properties of resulting tags and folksonomies. In this paper, we focus on problem (i) based on a distinction between two types of user motivations that we have identified in earlier work: Categorizers vs. Describers. To that end, we systematically define and evaluate a number of measures designed to discriminate between describers, i.e. users who use tags for describing resources as opposed to categorizers, i.e. users who use tags for categorizing resources. Subsequently, we present empirical findings from qualitative and quantitative evaluations of the measures on real-world tagging behavior. In addition, we conducted a recommender evaluation in which we study the effectiveness of each of the presented measures and found the measure based on the tag content to be the most accurate in predicting the user behavior, closely followed by a content-independent measure. The overall contribution of this paper is the presentation of empirical evidence that tagging motivation can be approximated with simple statistical measures.
Our research is relevant for (a) designers of tagging systems aiming to better understand the motivations of their users and (b) researchers interested in studying the effects of users' tagging motivation on the properties of resulting tags and emergent structures in social tagging systems.}, acmid = {1810645}, address = {New York, NY, USA}, author = {K\"{o}rner, Christian and Kern, Roman and Grahsl, Hans-Peter and Strohmaier, Markus}, booktitle = {Proceedings of the 21st ACM Conference on Hypertext and Hypermedia}, doi = {10.1145/1810617.1810645}, interhash = {ccca64b638181c35972c71e586ddc0c2}, intrahash = {87e3f9fa38eed6342454dcf47bb3e575}, isbn = {978-1-4503-0041-4}, location = {Toronto, Ontario, Canada}, numpages = {10}, pages = {157--166}, publisher = {ACM}, series = {HT '10}, title = {Of Categorizers and Describers: An Evaluation of Quantitative Measures for Tagging Motivation}, url = {http://doi.acm.org/10.1145/1810617.1810645}, year = 2010 } @inproceedings{doerfel2013analysis, abstract = {Since the rise of collaborative tagging systems on the web, the tag recommendation task -- suggesting suitable tags to users of such systems while they add resources to their collection -- has been tackled. However, the (offline) evaluation of tag recommendation algorithms usually suffers from difficulties like the sparseness of the data or the cold start problem for new resources or users. Previous studies therefore often used so-called post-cores (specific subsets of the original datasets) for their experiments. In this paper, we conduct a large-scale experiment in which we analyze different tag recommendation algorithms on different cores of three real-world datasets. We show that a recommender's performance depends on the particular core and explore correlations between performances on different cores.}, acmid = {2507222}, address = {New York, NY, USA}, author = {Doerfel, Stephan and Jäschke, Robert}, booktitle = {Proceedings of the 7th ACM Conference on Recommender Systems}, doi = {10.1145/2507157.2507222}, interhash = {3eaf2beb1cdad39b7c5735a82c3338dd}, intrahash = {a73213a865503252caa4b28e88a77108}, isbn = {978-1-4503-2409-0}, location = {Hong Kong, China}, numpages = {4}, pages = {343--346}, publisher = {ACM}, series = {RecSys '13}, title = {An Analysis of Tag-Recommender Evaluation Procedures}, url = {http://doi.acm.org/10.1145/2507157.2507222}, year = 2013 }
@inproceedings{mitzlaff2011community, abstract = {Community mining is a prominent approach for identifying (user) communities in social and ubiquitous contexts. While there are a variety of methods for community mining and detection, the effective evaluation and validation of the mined communities is usually non-trivial. Often there is no evaluation data at hand in order to validate the discovered groups.}, author = {Mitzlaff, Folke and Atzmueller, Martin and Benz, Dominik and Hotho, Andreas and Stumme, Gerd}, booktitle = {Analysis of Social Media and Ubiquitous Data}, doi = {10.1007/978-3-642-23599-3_5}, editor = {Atzmueller, Martin and Hotho, Andreas and Strohmaier, Markus and Chin, Alvin}, interhash = {1ef065a81ed836dfd31fcc4cd4da133b}, intrahash = {6f0d819fd09357e11ef074c242f824a6}, isbn = {978-3-642-23598-6}, pages = {79-98}, publisher = {Springer Berlin Heidelberg}, series = {Lecture Notes in Computer Science}, title = {Community Assessment Using Evidence Networks}, url = {http://dx.doi.org/10.1007/978-3-642-23599-3_5}, volume = 6904, year = 2011 }
@preprint{beel2013research, author = {Beel, Joeran and Langer, Stefan and Genzmehr, Marcel and Gipp, Bela and Breitinger, Corinna and Nürnberger, Andreas}, interhash = {544758b1fd737c010643f529c4f48ae6}, intrahash = {4afa2bd342dda6b6d32713aa0fbc33bd}, title = {Research Paper Recommender System Evaluation: A Quantitative Literature Survey}, year = 2013 } @article{strohmaier2011evaluation, author = {Strohmaier, Markus and Helic, Denis and Benz, Dominik and Körner, Christian and Kern, Roman}, interhash = {87e110b0ade230877db6855cacabcb4d}, intrahash = {603161eb4c5b2f87f3d3a50f87015337}, journal = {ACM Transactions on Intelligent Systems and Technology}, title = {Evaluation of Folksonomy Induction Algorithms}, url = {http://tist.acm.org/index.html}, vgwort = {43}, year = 2012 } @article{thelwall2012journal, abstract = {In theory, the web has the potential to provide information about the wider impact of academic research, beyond traditional scholarly impact. This is because the web can reflect non-scholarly uses of research, such as in online government documents, press coverage or public discussions. Nevertheless, there are practical problems with creating metrics for journals based on web data: principally that most such metrics should be easy for journal editors or publishers to manipulate. Nevertheless, two alternatives seem to have both promise and value: citations derived from digitised books and download counts for journals within specific delivery platforms.}, author = {Thelwall, Mike}, doi = {10.1007/s11192-012-0669-x}, interhash = {834707cf0663109f7811a14ae746be72}, intrahash = {284883bbaa636a0bab13fc54b903f363}, issn = {0138-9130}, journal = {Scientometrics}, language = {English}, number = 2, pages = {429--441}, publisher = {Springer Netherlands}, title = {Journal impact evaluation: a webometric perspective}, url = {http://dx.doi.org/10.1007/s11192-012-0669-x}, volume = 92, year = 2012 }
@inproceedings{dominguezgarcia2012freset, abstract = {FReSET is a new recommender systems evaluation framework aiming to support research on folksonomy-based recommender systems. It provides interfaces for the implementation of folksonomy-based recommender systems and supports consistent and reproducible offline evaluations on historical data. Unlike other recommender systems framework projects, the emphasis here is on providing a flexible framework allowing users to implement their own folksonomy-based recommender algorithms and pre-processing filtering methods rather than just providing a collection of collaborative filtering implementations. FReSET includes a graphical interface for result visualization and different cross-validation implementations to complement the basic functionality.}, acmid = {2365939}, address = {New York, NY, USA}, author = {Dom\'{\i}nguez Garc\'{\i}a, Renato and Bender, Matthias and Anjorin, Mojisola and Rensing, Christoph and Steinmetz, Ralf}, booktitle = {Proceedings of the 4th ACM RecSys workshop on Recommender systems and the social web}, doi = {10.1145/2365934.2365939}, interhash = {489207308b5d7f064163652763794ce6}, intrahash = {c78b033eb1b463ff00c4fc67ed8bf679}, isbn = {978-1-4503-1638-5}, location = {Dublin, Ireland}, numpages = {4}, pages = {25--28}, publisher = {ACM}, series = {RSWeb '12}, title = {FReSET: an evaluation framework for folksonomy-based recommender systems}, url = {http://doi.acm.org/10.1145/2365934.2365939}, year = 2012 } @inproceedings{parra2009evaluation, abstract = {Motivated by the potential use of collaborative tagging systems to develop new recommender systems, we have implemented and compared three variants of user-based collaborative filtering algorithms to provide recommendations of articles on CiteULike. In our first approach, Classic Collaborative Filtering (CCF), we use Pearson correlation to calculate similarity between users and a classic adjusted ratings formula to rank the recommendations. Our second approach, Neighbor-weighted Collaborative Filtering (NwCF), incorporates the number of raters in the ranking formula of the recommendations. A modified version of the Okapi BM25 IR model over users' tags is implemented in our third approach to form the user neighborhood. Our results suggest that incorporating the number of raters into the algorithms leads to an improvement of precision, and they also support that tags can be considered as an alternative to Pearson correlation to calculate the similarity between users and their neighbors in a collaborative tagging system.
}, author = {Parra, Denis and Brusilovsky, Peter}, booktitle = {Proceedings of the Workshop on Web 3.0: Merging Semantic Web and Social Web}, interhash = {03a51e24ecab3ad66fcc381980144fea}, intrahash = {42773258c36ccf2f59749991518d1784}, issn = {1613-0073}, location = {Torino, Italy}, month = jun, series = {CEUR Workshop Proceedings}, title = {Evaluation of Collaborative Filtering Algorithms for Recommending Articles on CiteULike}, url = {http://ceur-ws.org/Vol-467/paper5.pdf}, volume = 467, year = 2009 } @incollection{shani2011evaluating, author = {Shani, Guy and Gunawardana, Asela}, booktitle = {Recommender Systems Handbook}, interhash = {c93599e113544cde3f44502c88775c20}, intrahash = {63a1a401a35be851b9864966184c6815}, pages = {257--297}, publisher = {Springer}, title = {Evaluating Recommendation Systems}, year = 2011 } @inproceedings{joachims2005accurately, abstract = {This paper examines the reliability of implicit feedback generated from clickthrough data in WWW search. Analyzing the users' decision process using eyetracking and comparing implicit feedback against manual relevance judgments, we conclude that clicks are informative but biased. While this makes the interpretation of clicks as absolute relevance judgments difficult, we show that relative preferences derived from clicks are reasonably accurate on average.}, acmid = {1076063}, address = {New York, NY, USA}, author = {Joachims, Thorsten and Granka, Laura and Pan, Bing and Hembrooke, Helene and Gay, Geri}, booktitle = {Proceedings of the 28th annual international ACM SIGIR conference on Research and development in information retrieval}, doi = {10.1145/1076034.1076063}, interhash = {050982b76855a6b1258ed0b40cb69018}, intrahash = {8c488477626fa59db419ac77f3552029}, isbn = {1-59593-034-5}, location = {Salvador, Brazil}, numpages = {8}, pages = {154--161}, publisher = {ACM}, title = {Accurately interpreting clickthrough data as implicit feedback}, url = {http://doi.acm.org/10.1145/1076034.1076063}, year = 2005 } @presentation{kohavi2012online, abstract = {The web provides an unprecedented opportunity to accelerate innovation by evaluating ideas quickly and accurately using controlled experiments (e.g., A/B tests and their generalizations). Whether for front-end user-interface changes, or backend recommendation systems and relevance algorithms, online controlled experiments are now utilized to make data-driven decisions at Amazon, Microsoft, eBay, Facebook, Google, Yahoo, Zynga, and at many other companies. While the theory of a controlled experiment is simple, and dates back to Sir Ronald A. Fisher’s experiments at the Rothamsted Agricultural Experimental Station in England in the 1920s, the deployment and mining of online controlled experiments at scale—thousands of experiments now—has taught us many lessons. We provide an introduction, share real examples, key learnings, cultural challenges, and humbling statistics.
}, author = {Kohavi, Ron}, day = 12, interhash = {36a473c449c5ede0589c2801781a0579}, intrahash = {aa31e13651d5d1eab42e449e55a0e745}, month = sep, title = {Online Controlled Experiments: Introduction, Learnings, and Humbling Statistics}, type = {Industry keynote at ACM Recommender Systems}, url = {http://www.exp-platform.com/Pages/2012RecSys.aspx}, year = 2012 } @article{alonso2008crowdsourcing, abstract = {Relevance evaluation is an essential part of the development and maintenance of information retrieval systems. Yet traditional evaluation approaches have several limitations; in particular, conducting new editorial evaluations of a search system can be very expensive. We describe a new approach to evaluation called TERC, based on the crowdsourcing paradigm, in which many online users, drawn from a large community, each performs a small evaluation task.}, acmid = {1480508}, address = {New York, NY, USA}, author = {Alonso, Omar and Rose, Daniel E. and Stewart, Benjamin}, doi = {10.1145/1480506.1480508}, interhash = {8441d7fed92813634f61fa148ef2b870}, intrahash = {4a47833e85558b740788607cb79ba795}, issn = {0163-5840}, issue_date = {December 2008}, journal = {SIGIR Forum}, month = nov, number = 2, numpages = {7}, pages = {9--15}, publisher = {ACM}, title = {Crowdsourcing for relevance evaluation}, url = {http://doi.acm.org/10.1145/1480506.1480508}, volume = 42, year = 2008 } @book{Memo849791, author = {Backhaus, Klaus and others}, coverage = {VIII, 575 S.}, edition = {12., vollst. überarb. Aufl.}, editor = {Backhaus, Klaus}, interhash = {13a57c36ac5b340d21b4f9b4bba4ad50}, intrahash = {d570bf164f1ab9f96e4afeac5875a07a}, isbn = {978-3-540-85044-1}, publisher = {Springer}, publisherplace = {Berlin [u.a.]}, title = {Multivariate Analysemethoden, eine anwendungsorientierte Einführung}, year = 2008 } @article{Go_Huang_Bhayani_2009, author = {Go, Alec and Huang, Lei and Bhayani, Richa}, interhash = {c462bf3fa792403429b46ec83efc2d06}, intrahash = {21e712d455a36a1125bd9bfe6c9383a8}, journal = {CS224N Project Report, Stanford}, title = {Twitter Sentiment Classification using Distant Supervision}, url = {http://nlp.stanford.edu/courses/cs224n/2009/fp/3.pdf}, year = 2009 }