% Schaefer2016a: LREC 2016 conference paper on the CommonCOW web corpora.
% Nicoletta Calzolari was the conference chair; that role is noted here rather
% than inside the Editor field, where it would be parsed as part of her name.
% EventDate holds the conference days (23-28 May 2016) in ISO 8601 form.
@InProceedings{Schaefer2016a,
  Title                    = {{CommonCOW}: Massively Huge Web Corpora from {CommonCrawl} Data and a Method to Distribute them Freely under Restrictive {EU} Copyright Laws},
  Author                   = {Schäfer, Roland},
  Booktitle                = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
  Year                     = {2016},
  Month                    = may,
  Address                  = {Portorož, Slovenia},
  Editor                   = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Grobelnik, Marko and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  Pages                    = {4500--4504},
  Publisher                = {European Language Resources Association (ELRA)},
  Abstract                 = {In this paper, I describe a method of creating massively huge web corpora from the CommonCrawl data sets and redistributing the resulting annotations in a stand-off format. Current EU (and especially German) copyright legislation categorically forbids the redistribution of downloaded material without express prior permission by the authors. Therefore, stand-off annotations or other derivates are the only format in which European researchers (like myself) are allowed to re-distribute the respective corpora. In order to make the full corpora available to the public despite such restrictions, the stand-off format presented here allows anybody to locally reconstruct the full corpora with the least possible computational effort.},
  EventDate                = {2016-05-23/2016-05-28},
  ISBN                     = {978-2-9517408-9-1},
  Language                 = {english}
}

