Commit 3f421a4b authored by Felix Eckhofer
Browse files

add dictionary data and generate_* scripts

parent 3d1844a0
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/bin/sh
#
# Prints the car brands listed on the German Wikipedia page
# "Liste von Automobilmarken": one brand per line, upper-cased, sorted and
# de-duplicated.
#
# Exit status is non-zero when the temporary file cannot be created or the
# download fails (instead of silently printing an empty list).

# https plus "curl -L" (see main): the wikitext has to come from the final
# location. A redirect response is not an error for "curl -f", so without
# following it the script would happily process an empty body.
URL="https://de.wikipedia.org/w/index.php?title=Liste_von_Automobilmarken&action=raw"

# extract_brands FILE
# Reads raw wikitext from FILE and writes one brand name per line to stdout
# (unsorted, duplicates possible). Three kinds of table rows are handled.
extract_brands() {
  grep '^|\[\[' "$1" | # brands with links
    sed -r 's#^\|\[\[##; # remove start of line "|[["
      s#[^]]+\|##; # remove link target if a separate link text exists
      s#\]\].*##; # remove end of line "]] ..."
    '
  grep -E '^\|[^[|]+\|\|' "$1" | # brands without links
    sed -r 's#^\|##; # remove start of line "|"
      s#\|.*##; # remove end of line "| ..."
    '
  grep -E "^\|data-sort-value[^|]+\|[^']" "$1" | # brands with special characters
    sed -r 's#^\|[a-z-]+="##; # remove start of line "|data-sort-value=\""
      s#".*##; # remove end of line "\" ..."
      s# &$##; # remove trailing &
    '
}

# main
# Downloads the page into a temporary file, extracts the brands and prints
# the normalized list. Returns non-zero on failure.
main() {
  brands_raw=$(mktemp) || return 1
  # Remove the temporary file on every exit path. Signals are turned into a
  # normal exit so that the EXIT trap also runs when the script is interrupted.
  trap 'rm -f "$brands_raw"' EXIT
  trap 'exit 1' HUP INT TERM

  if ! curl -f -s -L "$URL" > "$brands_raw"; then
    printf '%s: download failed: %s\n' "${0##*/}" "$URL" >&2
    return 1
  fi

  extract_brands "$brands_raw" |
    tr 'a-zöäü' 'A-ZÖÄÜ' |
    sort -u
}

main "$@"
#!/bin/sh
#
# Prints the names of German municipalities with common suffixes stripped:
# one name per line, upper-cased, sorted and de-duplicated.
#
# Usage: <this script> [XLS_FILE]
#   XLS_FILE  spreadsheet to read; defaults to data/AuszugGV2QAktuell.xls
#             (relative to the current directory)
#
# Exit status is non-zero when xls2csv is missing or XLS_FILE is unreadable.
#
# needs xls2csv from catdoc
# data from https://www.destatis.de/DE/ZahlenFakten/LaenderRegionen/Regionales/Gemeindeverzeichnis/Administrativ/AdministrativeUebersicht.html ("Politisch selbstständige Gemeinden")

DEFAULT_XLS="data/AuszugGV2QAktuell.xls"

# filter_gemeinden
# Reads the CSV produced by "xls2csv -q3" on stdin and writes the normalized
# municipality names to stdout.
filter_gemeinden() {
  # Escape empty fields (for awk). The ",," rule is applied twice because a
  # single global pass cannot fix overlapping matches such as ",,,".
  sed 's#^,#"",#;s#,,#,"",#g;s#,,#,"",#g;s#,$#,""#' |
    awk -F'","' '{ print $7","$8 }' | # filter gemeinde-nr, gemeinde-name
    grep -E "^[0-9]+," | # only show entries with gemeinde-nr
    sed -r 's#[^,]+,##; # remove gemeinde-nr
      s#,[^,]+$##; # remove gemeinde-typ
      s# (am|vor|im|bei|auf|in|an) .*##i; # remove common suffixes
      s# [aibv]\..*##i; # remove common suffixes (abbrev)
      s# [(].*##; # remove suffixes in parentheses
      s#/.*##; # remove suffixes after a slash
    ' |
    tr 'a-zöäü' 'A-ZÖÄÜ' |
    sort -u
}

# main [XLS_FILE]
# Converts the spreadsheet with xls2csv and feeds it through the filter.
main() {
  gemeinden_xls="${1:-$DEFAULT_XLS}"

  if ! command -v xls2csv > /dev/null 2>&1; then
    printf '%s: xls2csv not found (it is part of catdoc)\n' "${0##*/}" >&2
    return 1
  fi
  if [ ! -r "$gemeinden_xls" ]; then
    printf '%s: cannot read %s\n' "${0##*/}" "$gemeinden_xls" >&2
    return 1
  fi

  xls2csv -q3 "$gemeinden_xls" | filter_gemeinden
}

main "$@"
#!/bin/sh
#
# Prints the single-word German entries of the de-en dictionary published by
# TU Chemnitz: one word per line, upper-cased, sorted and de-duplicated.
#
# Exit status is non-zero when the working directory cannot be created, the
# download fails or the archive cannot be decompressed.

URL="http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en/de-en.txt.gz"

# extract_words
# Reads the uncompressed dictionary on stdin and writes the word list to
# stdout.
#
# Two sed processes are used on purpose: the first one turns alternative forms
# into real lines, the second one filters those lines one by one. Inside a
# single sed the later commands would see all alternatives of an entry as one
# multi-line pattern space, so the first annotation would swallow every
# following alternative.
extract_words() {
  sed -r 's#::.*##; # delete translation
    /^#.*/d; # delete comments
    s/ ?[;|] ?/\
/g; # split alternative forms into separate lines
  ' |
    sed -r 's# ?[([{].*##; # delete annotations and metadata
      /[])}]/d; # delete leftovers of annotations that were split apart
      s# +$##; # delete trailing white space (before the end-of-line checks)
      /^$/d; # delete lines that ended up empty
      / [^ ]/d; # delete composite phrases
      /\.\.\.$/d; # delete word-parts
    ' |
    tr 'a-zöäü' 'A-ZÖÄÜ' |
    sort -u
}

# main
# Downloads and decompresses the dictionary in a private temporary directory,
# then prints the extracted word list. Returns non-zero on failure.
main() {
  dict_dir=$(mktemp -d) || return 1
  # Remove the working directory on every exit path. Signals are turned into
  # a normal exit so that the EXIT trap also runs when interrupted.
  trap 'rm -rf -- "$dict_dir"' EXIT
  trap 'exit 1' HUP INT TERM

  # Download to a file first: inside a pipeline a curl failure would go
  # unnoticed and an empty word list would be printed.
  if ! curl -f -s -L "$URL" > "$dict_dir/de-en.txt.gz"; then
    printf '%s: download failed: %s\n' "${0##*/}" "$URL" >&2
    return 1
  fi
  if ! gzip -d "$dict_dir/de-en.txt.gz"; then # leaves de-en.txt behind
    printf '%s: cannot decompress dictionary\n' "${0##*/}" >&2
    return 1
  fi

  extract_words < "$dict_dir/de-en.txt"
}

main "$@"
#!/bin/sh
#
# Prints the German licence-plate district codes found on the German Wikipedia
# page "Liste der Kfz-Kennzeichen in Deutschland": one code per line, sorted
# and de-duplicated.
#
# Exit status is non-zero when the temporary file cannot be created or the
# download fails (instead of silently printing an empty list).

# https plus "curl -L" (see main): the wikitext has to come from the final
# location. A redirect response is not an error for "curl -f", so without
# following it the script would happily process an empty body.
URL="https://de.wikipedia.org/w/index.php?title=Liste_der_Kfz-Kennzeichen_in_Deutschland&action=raw"

# extract_codes
# Reads raw wikitext on stdin and writes every bold ('''...''') token that
# consists only of upper-case letters (umlauts included, as in codes such as
# GÖ or TÜ) and digits. "grep -o" prints exactly the matched tokens, so the
# result does not depend on where in the line the code appears. No case
# conversion is needed because the pattern admits upper-case letters only.
extract_codes() {
  grep -oE "'''[A-ZÄÖÜ0-9]+'''" |
    tr -d "'" | # strip the bold markup
    sort -u
}

# main
# Downloads the page into a temporary file and prints the extracted codes.
# Returns non-zero on failure.
main() {
  kfz_raw=$(mktemp) || return 1
  # Remove the temporary file on every exit path. Signals are turned into a
  # normal exit so that the EXIT trap also runs when interrupted.
  trap 'rm -f "$kfz_raw"' EXIT
  trap 'exit 1' HUP INT TERM

  # Download to a file first: inside a pipeline a curl failure would go
  # unnoticed and an empty list would be printed.
  if ! curl -f -s -L "$URL" > "$kfz_raw"; then
    printf '%s: download failed: %s\n' "${0##*/}" "$URL" >&2
    return 1
  fi

  extract_codes < "$kfz_raw"
}

main "$@"
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment