% Bibliography entries on spam detection (social bookmarking, blogs, e-mail,
% web links). One field per line, fields in alphabetical order.
%
% Conventions used below:
%   - doi holds the bare DOI (no resolver prefix); the resolver URL stays in url / ee.
%   - Accented letters use the BibTeX special-character form, e.g. {\"o}, {\ss}.
%   - interhash / intrahash are kept verbatim; presumably identifiers written by
%     the exporting tool -- TODO confirm before relying on them.
%   - timestamp holds the value that used to be stored in the date field of the
%     DBLP-derived entries. For the Kong et al. entry it disagrees with the year,
%     so it is not a publication date; presumably a record date -- TODO confirm.

@inproceedings{bullock2011privacyaware,
  abstract  = {With the increased popularity of Web 2.0 services in the last years data privacy has become a major concern for users. The more personal data users reveal, the more difficult it becomes to control its disclosure in the web. However, for Web 2.0 service providers, the data provided by users is a valuable source for offering effective, personalised data mining services. One major application is the detection of spam in social bookmarking systems: in order to prevent a decrease of content quality, providers need to distinguish spammers and exclude them from the system. They thereby experience a conflict of interests: on the one hand, they need to identify spammers based on the information they collect about users, on the other hand, they need to respect privacy concerns and process as few personal data as possible. It would therefore be of tremendous help for system developers and users to know which personal data are needed for spam detection and which can be ignored. In this paper we address these questions by presenting a data privacy aware feature engineering approach. It consists of the design of features for spam classification which are evaluated according to both, performance and privacy conditions. Experiments using data from the social bookmarking system BibSonomy show that both conditions must not exclude each other.},
  acmid     = {2024306},
  address   = {New York, NY, USA},
  articleno = {15},
  author    = {Bullock, Beate Navarro and Lerch, Hana and Ro{\ss}nagel, Alexander and Hotho, Andreas and Stumme, Gerd},
  booktitle = {Proceedings of the 11th International Conference on Knowledge Management and Knowledge Technologies},
  doi       = {10.1145/2024288.2024306},
  interhash = {7a2d6a35c124ea0fe31c962f8f150916},
  intrahash = {00a8f31185a34957eb16d500d7d51398},
  isbn      = {978-1-4503-0732-1},
  location  = {Graz, Austria},
  numpages  = {8},
  pages     = {15:1--15:8},
  publisher = {ACM},
  series    = {i-KNOW '11},
  title     = {Privacy-aware spam detection in social bookmarking systems},
  url       = {http://doi.acm.org/10.1145/2024288.2024306},
  year      = 2011
}

@article{kolari2006blog,
  author    = {Kolari, P. and Java, A. and Finin, T. and Mayfield, J. and Joshi, A. and Martineau, J.},
  interhash = {22f376a3a5e2ee890908d81f409fc08c},
  intrahash = {e8d9c31822799d4d862a4bbcd885a4cf},
  journal   = {TREC 2006 Blog Track Notebook},
  publisher = {Citeseer},
  title     = {Blog track open task: Spam blog classification},
  url       = {http://scholar.google.com/scholar.bib?q=info:BXvRJMPpbFUJ:scholar.google.com/&output=citation&hl=en&as_sdt=2000&as_vis=1&ct=citation&cd=10},
  year      = 2006
}

@inproceedings{atze09,
  address   = {Krakow, Poland},
  author    = {Atzmueller, Martin and Lemmerich, Florian and Krause, Beate and Hotho, Andreas},
  booktitle = {7th Conference on Computer Methods and Systems},
  interhash = {c226a55c0cc2dc6f261b86c09225c260},
  intrahash = {014dbd07807e05a5ea9aafb2dbead39b},
  isbn      = {83-916420-5-4},
  month     = nov,
  title     = {Who are the Spammers? Understandable Local Patterns for Concept Description},
  url       = {http://www.cms.agh.edu.pl/},
  year      = 2009
}

@inproceedings{ALKH:09,
  author    = {Atzmueller, Martin and Lemmerich, Florian and Krause, Beate and Hotho, Andreas},
  booktitle = {Proc. LeGo-09: From Local Patterns to Global Models, Workshop at the 2009 European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases},
  editor    = {F{\"u}rnkranz, Johannes and Knobbe, Arno},
  interhash = {d27cd7eee4ab571ad3753a3d370141ce},
  intrahash = {bb80bdcc06c8886968c453fd920dfe05},
  note      = {accepted},
  title     = {Towards Understanding Spammers - Discovering Local Patterns for Concept Characterization and Description},
  url       = {http://www.ke.tu-darmstadt.de/events/LeGo-09/04-Atzmueller.pdf},
  year      = 2009
}

@article{journals/internet/HeymannKG07,
  author    = {Heymann, Paul and Koutrika, Georgia and Garcia-Molina, Hector},
  doi       = {10.1109/MIC.2007.125},
  ee        = {http://doi.ieeecomputersociety.org/10.1109/MIC.2007.125},
  interhash = {dea5faea536678622993617bfc5fbb85},
  intrahash = {8a293527604e19085173fa461340f55e},
  journal   = {IEEE Internet Computing},
  number    = 6,
  pages     = {36--45},
  timestamp = {2007-11-08},
  title     = {Fighting Spam on Social Web Sites: A Survey of Approaches and Future Challenges},
  url       = {http://dblp.uni-trier.de/db/journals/internet/internet11.html#HeymannKG07},
  volume    = 11,
  year      = 2007
}

@inproceedings{conf/airweb/MarkinesCM09,
  author    = {Markines, Benjamin and Cattuto, Ciro and Menczer, Filippo},
  booktitle = {AIRWeb},
  crossref  = {conf/airweb/2009},
  doi       = {10.1145/1531914.1531924},
  editor    = {Fetterly, Dennis and Gy{\"o}ngyi, Zolt{\'a}n},
  ee        = {http://doi.acm.org/10.1145/1531914.1531924},
  interhash = {50847302da776b6e04e53209a0b54699},
  intrahash = {46e6041bcf0ba281cdcc3fc4cec6ae60},
  isbn      = {978-1-60558-438-6},
  pages     = {41--48},
  series    = {ACM International Conference Proceeding Series},
  timestamp = {2009-05-06},
  title     = {Social spam detection},
  url       = {http://dblp.uni-trier.de/db/conf/airweb/airweb2009.html#MarkinesCM09},
  year      = 2009
}

@inproceedings{koutrika2007combating,
  address   = {New York, NY, USA},
  author    = {Koutrika, Georgia and Effendi, Frans Adjie and Gy{\"o}ngyi, Zolt{\'a}n and Heymann, Paul and Garcia-Molina, Hector},
  booktitle = {AIRWeb '07: Proceedings of the 3rd international workshop on Adversarial information retrieval on the web},
  doi       = {10.1145/1244408.1244420},
  interhash = {8b6de1f035a46f5465f1ed868a18c79a},
  intrahash = {776b76b33d469e438b0e5f74fc7ec7f0},
  isbn      = {978-1-59593-732-2},
  location  = {Banff, Alberta, Canada},
  pages     = {57--64},
  publisher = {ACM Press},
  title     = {Combating spam in tagging systems},
  url       = {http://portal.acm.org/citation.cfm?id=1244408.1244420},
  year      = 2007
}

@inproceedings{anti2008krause,
  address   = {New York, NY, USA},
  author    = {Krause, Beate and Schmitz, Christoph and Hotho, Andreas and Stumme, Gerd},
  booktitle = {AIRWeb '08: Proceedings of the 4th international workshop on Adversarial information retrieval on the web},
  doi       = {10.1145/1451983.1451998},
  interhash = {a45d40ac7776551301ad9dde5b25357f},
  intrahash = {68effe5d4b9460f9388e7685310f74c2},
  isbn      = {978-1-60558-159-0},
  location  = {Beijing, China},
  pages     = {61--68},
  publisher = {ACM},
  title     = {The Anti-Social Tagger - Detecting Spam in Social Bookmarking Systems},
  url       = {http://airweb.cse.lehigh.edu/2008/submissions/krause_2008_anti_social_tagger.pdf},
  year      = 2008
}

@inproceedings{conf/ceas/KongBRSR05,
  author    = {Kong, Joseph S. and Boykin, P. Oscar and Rezaei, Behnam Attaran and Sarshar, Nima and Roychowdhury, Vwani P.},
  booktitle = {CEAS},
  crossref  = {conf/ceas/2005},
  ee        = {http://www.ceas.cc/papers-2005/143.pdf},
  interhash = {4ff32863da5db6c23c744758b3ccdcbd},
  intrahash = {3d408185c7554a612b00508d618f539c},
  timestamp = {2006-06-01},
  title     = {Scalable and Reliable Collaborative Spam Filters: Harnessing the Global Social Email Networks},
  url       = {http://dblp.uni-trier.de/db/conf/ceas/ceas2005.html#KongBRSR05},
  year      = 2005
}

@inproceedings{golbeck04reputation,
  author    = {Golbeck, J. and Hendler, J.},
  booktitle = {Proceedings of the Conference on Email and Anti-Spam ({CEAS})},
  interhash = {72e1775df320df247001a588a474f5c2},
  intrahash = {20c47d8ac8569f465a7d21e3aca0b73b},
  location  = {Mountain View, CA, USA},
  month     = jul,
  text      = {J. Golbeck and J. Hendler. Reputation Network Analysis for Email Filtering. In Proc. of the Conference on Email and Anti-Spam (CEAS), Mountain View, CA, USA, July 2004.},
  title     = {Reputation Network Analysis for Email Filtering},
  url       = {http://citeseer.ist.psu.edu/golbeck04reputation.html},
  year      = 2004
}

@misc{gomes-2005,
  abstract      = {Email is an increasingly important and ubiquitous means of communication, both facilitating contact between private individuals and enabling rises in the productivity of organizations. However the relentless rise of automatic unauthorized emails, a.k.a. spam is eroding away much of the attractiveness of email communication. Most of the attention dedicated to date to spam detection has focused on the content of the emails or on the addresses or domains associated with spam senders. Although methods based on these - easily changeable - identifiers work reasonably well they miss on the fundamental nature of spam as an opportunistic relationship, very different from the normal mutual relations between senders and recipients of legitimate email. Here we present a comprehensive graph theoretical analysis of email traffic that captures these properties quantitatively. We identify several simple metrics that serve both to distinguish between spam and legitimate email and to provide a statistical basis for models of spam traffic.},
  archiveprefix = {arXiv},
  author        = {Gomes, Luiz H. and Almeida, Rodrigo B. and Bettencourt, Luis M. A. and Almeida, Virgilio and Almeida, Jussara M.},
  eprint        = {physics/0504025},
  interhash     = {e20fec09f4faf2401c6a9dd0d654d0e9},
  intrahash     = {fff54b482dc6bbd160a270b0f494c149},
  title         = {Comparative Graph Theoretical Characterization of Networks of Spam and Legitimate Email},
  url           = {http://www.citebase.org/abstract?id=oai:arXiv.org:physics/0504025},
  year          = 2005
}

@inproceedings{1166191,
  address   = {New York, NY, USA},
  author    = {Hidalgo, Jos{\'e} Mar{\'\i}a G{\'o}mez and Bringas, Guillermo Cajigas and S{\'a}nz, Enrique Puertas and Garc{\'\i}a, Francisco Carrero},
  booktitle = {DocEng '06: Proceedings of the 2006 ACM symposium on Document engineering},
  doi       = {10.1145/1166160.1166191},
  interhash = {6c439b2270b1bc2f5a3d6e0fa9eb1ae0},
  intrahash = {45ac4e6a34bb50c2061a29e52079b576},
  isbn      = {1-59593-515-0},
  location  = {Amsterdam, The Netherlands},
  pages     = {107--114},
  publisher = {ACM Press},
  title     = {Content based {SMS} spam filtering},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1166191&type=pdf&coll=&dl=acm&CFID=15151515&CFTOKEN=6184618},
  year      = 2006
}

@techreport{284,
  author      = {Gyongyi, Z. and Berkhin, P. and Garcia-Molina, H. and Pedersen, J.},
  institution = {Stanford Univ.},
  interhash   = {987e3d3fd3c529a2662a5387bc568793},
  intrahash   = {4dda644faa9132ef2f09ac8a13f11d75},
  title       = {Link spam detection based on mass estimation},
  url         = {http://infolab.stanford.edu/~zoltan/publications/gyongyi2006link.pdf},
  year        = 2005
}

@inproceedings{conf/ecml/DrostS05,
  author    = {Drost, Isabel and Scheffer, Tobias},
  booktitle = {ECML},
  doi       = {10.1007/11564096_14},
  ee        = {http://dx.doi.org/10.1007/11564096_14},
  interhash = {813e48f0b04788c76ec4c9500cc5f8a0},
  intrahash = {243303d890cd0f999d78b1a7e148e38c},
  pages     = {96--107},
  title     = {Thwarting the {Nigritude} {Ultramarine}: Learning to Identify Link Spam},
  url       = {http://dblp.uni-trier.de/db/conf/ecml/ecml2005.html#DrostS05},
  year      = 2005
}

@inproceedings{goodword2006lowd,
  address   = {Palo Alto, CA},
  author    = {Lowd, Daniel and Meek, Christopher},
  booktitle = {Second Conference on Email and Anti-Spam (CEAS)},
  interhash = {c86d81bb31ea199c1d7aaf8b5e3e280d},
  intrahash = {947e546ff2a77a7f099da4955fa73df2},
  title     = {Good Word Attacks on Statistical Spam Filters},
  url       = {http://www.cs.washington.edu/homes/lowd/ceas05lowd.pdf},
  url1      = {http://www.cs.washington.edu/homes/lowd/ceas05lowd.ppt},
  year      = 2005
}

@techreport{boykin04:_person_email_networ,
  archiveprefix = {arXiv},
  author        = {Boykin, P. O. and Roychowdhury, V.},
  eprint        = {cond-mat/0402143},
  institution   = {University of California, Los Angeles},
  interhash     = {3fe3b3135786db00897541e4dfdd3523},
  intrahash     = {770fc07ad9949bbafa5c3b08f196907b},
  month         = feb,
  title         = {Personal Email Networks: An Effective Anti-Spam Tool},
  url           = {http://arxiv.org/abs/cond-mat/0402143},
  year          = 2004
}

@inproceedings{conf/vldb/GyongyiGP04,
  author    = {Gy{\"o}ngyi, Zolt{\'a}n and Garcia-Molina, Hector and Pedersen, Jan},
  booktitle = {VLDB},
  ee        = {http://www.vldb.org/conf/2004/RS15P3.PDF},
  interhash = {df645e278ec1d72e4576f95ec7bb94c0},
  intrahash = {0d15cc263e9ca534e79c2d6f470f725e},
  pages     = {576--587},
  title     = {Combating Web Spam with {TrustRank}},
  url       = {http://dblp.uni-trier.de/db/conf/vldb/vldb2004.html#GyongyiGP04},
  year      = 2004
}