% Biemann et al. (2013), journal article: building high-quality corpora from
% web texts (crawling, parallelized pre-processing, corpus quality assessment).
@article{BiemannEa2013,
  author   = {Biemann, Chris and Bildhauer, Felix and Evert, Stefan and Goldhahn, Dirk and Quasthoff, Uwe and Schäfer, Roland and Simon, Johannes and Swiezinski, Leonard and Zesch, Torsten},
  title    = {Scalable Construction of High-Quality Web Corpora},
  journal  = {Journal for Language Technology and Computational Linguistics},
  volume   = {28},
  number   = {2},
  pages    = {23--60},
  year     = {2013},
  abstract = {In this article, we give an overview about the necessary steps to construct high-quality corpora from web texts. We first focus on web crawling and the pros and cons of the existing crawling strategies. Then, we describe how the crawled data can be linguistically pre-processed in a parallelized way that allows the processing of web-scale input data. As we are working with web data, controlling the quality of the resulting corpus is an important issue, which we address by showing how corpus statistics and a linguistic evaluation can be used to assess the quality of corpora. Finally, we show how the availability of extremely large, high-quality corpora opens up new directions for research in various fields of linguistics, computational linguistics, and natural language processing.},
}

