52 | | Usage: |
| 53 | === Sample output === |
| 54 | {{{ |
| 55 | <doc source="https://en.wikipedia.org/wiki/Dog" lang="english" |
| 56 | lang_scores="english: 49.56, czech: 19.86, slovak: 20.15"> |
| 57 | <par_langs lang="english" lang_scores="english: 49.56, czech: 19.86, slovak: 20.15"/> |
| 58 | <p> |
| 59 | #wordform English Czech Slovak score for each word |
| 60 | Linnaeus 0.00 0.00 0.00 #unknown to all sample wordlists |
| 61 | considered 5.18 0.00 0.00 #English only |
| 62 | the 7.82 5.26 5.33 #English word, ~100 x more frequent in the English wl |
| 63 | dog 4.89 0.00 0.00 |
| 64 | to 7.48 7.05 7.15 #a valid word in all three languages |
| 65 | be 6.77 0.00 0.00 |
| 66 | a 7.37 7.56 7.66 #a valid word in all three languages |
| 67 | separate 4.91 0.00 0.00 |
| 68 | species 5.14 0.00 0.00 |
| 69 | <g/> |
| 70 | . 0.00 0.00 0.00 #punctuation is omitted from wordlists |
| 71 | </p> |
| 72 | </doc> |
| 73 | }}} |
| 74 | |
| 75 | |
| 76 | == Installation == |
| 77 | {{{ |
| 78 | wget http://corpus.tools/raw-attachment/wiki/Downloads/wcwb_lang_filter_1.0.tar.gz |
| 79 | tar -xzvf wcwb_lang_filter_1.0.tar.gz |
| 80 | cd wcwb_lang_filter_1.0 |
| 81 | make test/out.vert.lang_czech |
| 82 | }}} |
| 83 | |
| 84 | == Usage == |
| 128 | == To build your own frequency wordlist == |
| 129 | {{{ |
| 130 | #Get corpus frequencies of lowercased words from a corpus compiled by [https://nlp.fi.muni.cz/trac/noske Sketch Engine] |
| 131 | lsclex -f /corpora/registry/english_web_corpus lc | cut -f2,3 | ./uninorm_4.py | perl -pe 's, (\d+)$,\t$1,' > en.wl1 |
| 132 | lsclex -f /corpora/registry/czech_web_corpus lc | cut -f2,3 | ./uninorm_4.py | perl -pe 's, (\d+)$,\t$1,' > cs.wl1 |
| 133 | lsclex -f /corpora/registry/slovak_web_corpus lc | cut -f2,3 | ./uninorm_4.py | perl -pe 's, (\d+)$,\t$1,' > sk.wl1 |
| 134 | |
| 135 | #Or get the same from a vertical file |
| 136 | cut -f1 english_web_corpus.vert | grep -v '^<' | tr '[:upper:]' '[:lower:]' | sort | uniq -c | perl -pe 's,^\s*(\d+) (.*)$,$2\t$1,' > en.wl1 |
| 137 | cut -f1 czech_web_corpus.vert | grep -v '^<' | tr '[:upper:]' '[:lower:]' | sort | uniq -c | perl -pe 's,^\s*(\d+) (.*)$,$2\t$1,' > cs.wl1 |
| 138 | cut -f1 slovak_web_corpus.vert | grep -v '^<' | tr '[:upper:]' '[:lower:]' | sort | uniq -c | perl -pe 's,^\s*(\d+) (.*)$,$2\t$1,' > sk.wl1 |
| 139 | |
| 140 | #Filter the wordlist -- allow just characters valid for the language and a reasonable word length |
| 141 | grep '[abcdefghijklmnopqrstuvwxyz]' en.wl1 | grep -v -P "['.-]{2}" | ./wl_grep.py "[#@]?[abcdefghijklmnopqrstuvwxyzéè0-9'][abcdefghijklmnopqrstuvwxyzéè0-9'.-]{0,29}" > en.wl2 |
| 142 | grep '[aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxyýzž]' cs.wl1 | grep -v -P "['.-]{2}" | ./wl_grep.py "[#@]?[aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxyýzž0-9'][aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxyýzž0-9'.-]{0,29}" > cs.wl2 |
| 143 | grep '[aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž]' sk.wl1 | grep -v -P "['.-]{2}" | ./wl_grep.py "[#@]?[aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž0-9'][aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž0-9'.-]{0,29}" > sk.wl2 |
| 144 | |
| 145 | #Sort (not necessary) and pack |
| 146 | for f in {en,cs,sk}.wl2; do sort -k2,2rg -k1,1 $f | gzip > ${f}.frqwl.gz; done |
| 147 | }}} |
| 148 | |