### Tokenization
import spacy
import spacy_udpipe
###
### spaCy's own pretrained pipelines: download, then load into `nlp`
###
spacy_models = {
    "zh": "zh_core_web_sm",
    "ja": "ja_core_news_sm",
    "ko": "ko_core_news_sm",
    "en": "en_core_web_sm",
}
# Fetch every pretrained package first, then build a lang-code -> pipeline map.
for model_name in spacy_models.values():
    spacy.cli.download(model_name)
nlp = {code: spacy.load(model_name) for code, model_name in spacy_models.items()}
###
### UDPipe-backed pipelines (languages spaCy has no pretrained model for)
###
udpipe_models = ["cs", "id"]
for code in udpipe_models:
    spacy_udpipe.download(code)  # fetch the UDPipe model files
for code in udpipe_models:
    print(f"loading {code}")
    nlp[code] = spacy_udpipe.load(code)
# One sample text per language code, used below to exercise each pipeline.
testdata = dict(
    zh="有时,敏捷的棕色狐狸跳过了懒惰的猫。多么令人兴奋啊!",  # Chinese
    cs="Někdy, rychlá hnědá liška přeskočí línou kočku. Jak vzrušující!",  # Czech
    ja="時々、素早い茶色の狐が怠けた猫を飛び越える。なんて素晴らしい!",  # Japanese
    ko="때때로, 빠른 갈색 여우가 게으른 고양이를 뛰어넘는다. 정말 신난다!",  # Korean
    id="Kadang-kadang, rubah cokelat cepat melompati kucing malas. Betapa seru!",  # Indonesian
    en="Sometimes, the quick brown fox jumps over the lazy cat. How exciting!",  # English
)
# from spacy import displacy
# Run every sample through its language's pipeline and dump one token per
# line: text, coarse POS tag, lemma, and morphological features.
for code, text in testdata.items():
    print(f"\nLanguage: {code.upper()}")
    doc = nlp[code](text)
    for sentence in doc.sents:
        # displacy.render(sentence)
        print('---')
        for tok in sentence:
            print(tok.text, tok.pos_, tok.lemma_, tok.morph, sep='\t')