### Tokenization
import spacy
import spacy_udpipe

###
### Download the models that spaCy ships for these languages
###

spacy_models = {'zh': 'zh_core_web_sm',
                'ja': 'ja_core_news_sm',
                'ko': 'ko_core_news_sm',
                'en': 'en_core_web_sm'}
# download models
for model in spacy_models.values():
  spacy.cli.download(model)


# load every downloaded model into one shared {lang: pipeline} dict
nlp = dict()
for lang, model in spacy_models.items():
  nlp[lang] = spacy.load(model)
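
### Quick sanity check (illustrative; the sample text is my own):
### the loaded pipelines expose spaCy's Doc/Token API directly.
doc = nlp['en']('Universal Declaration of Human Rights')
print([(t.text, t.pos_) for t in doc])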

###
### Download models from UDPipe
###

# languages without a pretrained spaCy pipeline; fall back to UDPipe models
udpipe_models = ['cs', 'id', 'vi']
for lang in udpipe_models:
  spacy_udpipe.download(lang) # download model

for lang in udpipe_models:
  print(f'loading {lang}')
  nlp[lang] = spacy_udpipe.load(lang)
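
### Same illustrative check for a UDPipe-backed pipeline; it returns the same
### kind of spaCy Doc, so the downstream code can treat both alike.
doc = nlp['cs']('Všeobecná deklarace lidských práv')
print([(t.text, t.pos_) for t in doc])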

  
import nltk
nltk.download('udhr')  # provides nltk.corpus.udhr, read below
### I printed out the file ids and then searched by hand!
print(nltk.corpus.udhr.fileids())
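### An alternative to eyeballing the list (a sketch): filter the ids
### programmatically, e.g. all files whose name mentions Czech.
print([f for f in nltk.corpus.udhr.fileids() if 'Czech' in f])
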
udhr_lang = {
    'en':'English-Latin1',
    'cs':'Czech-UTF8',
    'id':'Indonesian-Latin1',
    'zh':'Chinese_Mandarin-GB2312',
    'ko':'Korean_Hankuko-UTF8',
    'ja':'Japanese_Nihongo-UTF8',
    'vi':'Vietnamese-UTF8'
  }
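
### Optional sanity check: confirm that every fileid above resolves before
### running the full tagging loop.
for lang, fileid in udhr_lang.items():
  print(lang, fileid, len(nltk.corpus.udhr.raw(fileid)), 'chars')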

stats = dict()  # per-language sentence and POS counts

for lang, fileid in udhr_lang.items():
  lstats = dict()
  declaration = nltk.corpus.udhr.raw(fileid)
  doc = nlp[lang](declaration)
  lstats['sents'] = len(list(doc.sents))
  ### tally each POS tag
  for t in doc:
    lstats[t.pos_] = lstats.get(t.pos_, 0) + 1
  stats[lang] = lstats

# the 17 Universal POS tags
UPOS = "ADJ ADV NOUN VERB PROPN INTJ ADP AUX CCONJ SCONJ DET NUM PART PRON PUNCT SYM X".split()

print()
print('Crosslingual Comparison')
print()
print('POS', end='\t')
print('\t'.join(stats.keys()))
for thing in ['sents'] + UPOS:
  print(thing, end='\t')
  for lang in stats:
    print(stats[lang].get(thing, 0), end='\t')
  print()
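
### The tab-separated dump is hard to scan in a narrow terminal. As an
### alternative sketch (assumes pandas is installed, which nothing above
### requires), the same table as a DataFrame with one column per language:
import pandas as pd
df = pd.DataFrame({lang: {k: lstats.get(k, 0) for k in ['sents'] + UPOS}
                   for lang, lstats in stats.items()})
print(df)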