% Workshop paper (WAC-8, 2013) on rating text quality in web corpus construction.
% The first author's umlaut is written as a braced LaTeX accent macro so that
% 8-bit BibTeX treats it as one letter when sorting and building labels.
@InProceedings{SchaeferEa2013,
  Title                    = {The Good, the Bad, and the Hazy: Design Decisions in Web Corpus Construction},
  Author                   = {Sch{\"a}fer, Roland and Barbaresi, Adrien and Bildhauer, Felix},
  Booktitle                = {Proceedings of the 8th Web as Corpus Workshop ({WAC-8})},
  Year                     = {2013},

  Address                  = {Lancaster},
  Editor                   = {Evert, Stefan and Stemle, Egon and Rayson, Paul},
  Pages                    = {7--15},
  Publisher                = {SIGWAC},

  Abstract                 = {In this paper, we examine notions of text quality in the context of web corpus construction. Web documents often contain material which disqualifies them from inclusion in a corpus (tag clouds, lists of names or nouns, etc.). First, we look at the agreement between coders (especially corpus designers) given the task of rating text quality. Then, we evaluate a simple and fully unsupervised method of text quality assessment based on short and very frequent words. Finally, we describe our general approach to the construction of carefully cleansed and non-destructively normalized web corpora. Under this approach, we annotate documents with quality metrics instead of actually removing those documents classified as being of low quality.},
  Url                      = {http://rolandschaefer.net/?p=78}
}

