[docs] Adding some documentation for normalize.h options

2015-07-24 15:23:18 -04:00
parent caf714f06f
commit 5239c365d0
1 changed files with 20 additions and 0 deletions
--- a/src/normalize.h
+++ b/src/normalize.h
@@ -1,3 +1,23 @@
+/* normalize.h
+
+The normalize module provides several options for preprocessing full strings:
+
+- Unicode normalization (NFD/decomposition)
+- Transliteration (including Latin-ASCII)
+- Accent mark removal
+- UTF-8 lowercasing with utf8proc
+
+It also provides normalizations for individual string tokens:
+
+- Replace hyphens with space e.g. "quatre-vingt" => "quatre vingt"
+- Delete hyphens e.g. "auto-estrada" => "autoestrada"
+- Delete final period e.g. "R." => "R"
+- Delete acronym periods e.g. "U.S.A." => "USA"
+- Drop English possessive "Janelle's" => "Janelle"
+- Delete other apostrophes e.g. "O'Malley" => "OMalley" (not appropriate for Latin languages; use elision separation instead)
+
+*/
+
 #ifndef NORMALIZE_H
 #define NORMALIZE_H