% Conference paper in the IEEE Big Data 2015 proceedings (pp. 958--963).
% Braces in title/booktitle keep "k-NN" and "IEEE" from being re-cased by the style;
% the publisher is brace-wrapped so its literal " and " is never read as a list separator.
@inproceedings{c5aff97024e74817ac32c614466c034a,
title = "Scalable {k-NN} based text clustering",
abstract = "Clustering items using textual features is an important problem with many applications, such as root-cause analysis of spam campaigns, as well as identifying common topics in social media. Due to the sheer size of such data, algorithmic scalability becomes a major concern. In this work, we present our approach for text clustering that builds an approximate k-NN graph, which is then used to compute connected components representing clusters. Our focus is to understand the scalability / accuracy tradeoff that underlies our method: we do so through an extensive experimental campaign, where we use real-life datasets, and show that even rough approximations of k-NN graphs are sufficient to identify valid clusters. Our method is scalable and can be easily tuned to meet requirements stemming from different application domains.",
author = "Lulli, Alessandro and Debatty, Thibault and Dell'Amico, Matteo and Michiardi, Pietro and Ricci, Laura",
note = "Publisher Copyright: {\textcopyright} 2015 IEEE; 3rd IEEE International Conference on Big Data, IEEE Big Data 2015; Conference date: 29-10-2015 through 01-11-2015",
year = "2015",
month = dec,
day = "22",
doi = "10.1109/BigData.2015.7363845",
language = "English",
publisher = "{Institute of Electrical and Electronics Engineers Inc.}",
pages = "958--963",
editor = "Luo, Feng and Ogan, Kemafor and Zaki, Mohammed J. and Haas, Laura and Ooi, Beng Chin and Kumar, Vipin and Rachuri, Sudarsan and Pyne, Saumyadipta and Ho, Howard and Hu, Xiaohua and Yu, Shipeng and Hsiao, Morris Hui-I and Li, Jian",
booktitle = "Proceedings - 2015 {IEEE} International Conference on Big Data, {IEEE} Big Data 2015",
}