###
### we will use this to get text from the web
###
import urllib.request
import json, yaml, toml

def text_from_url(url):
    """
    Fetch `url` and return the response body decoded as UTF-8 text.

    If the server answers with an HTTP error status, a message is printed
    and None is returned, so callers should check for None before using
    the result.  Any other failure (for example urllib.error.URLError, or
    UnicodeDecodeError when the body is not valid UTF-8) propagates to the
    caller.
    """
    ### imported explicitly rather than relying on urllib.request
    ### happening to load urllib.error as a side effect
    from urllib.error import HTTPError

    try:
        response = urllib.request.urlopen(url)
    except HTTPError:
        print(f"I couldn't find: {url}")
        return None

    ### the with-block closes the response even if read() or decode() fails
    with response:
        return response.read().decode('utf-8')

### read from Gutenberg

raw = text_from_url("https://www.gutenberg.org/cache/epub/29019/pg29019.txt")

## save
## The text was decoded from UTF-8, so write it back as UTF-8 explicitly;
## the platform default encoding may not be able to represent every
## character in it.

with open("acag-raw.txt", 'w', encoding='utf-8') as f:
    f.write(raw)

## read
## Reading in text mode translates line endings to '\n', so the string
## read back may differ in length from the one that was downloaded.

with open("acag-raw.txt", encoding='utf-8') as f:
    raw = f.read()

### save just the story
### NOTE(review): the fixed offsets presumably match the length of the
### header and footer in this particular download -- confirm if the
### source file ever changes.

story = raw[1650: -19030]

with open("acag.txt", 'w', encoding='utf-8') as f:
    f.write(story)

### Simple size statistics for the story text.
### Words and Sentences are approximations: words are whitespace-separated
### chunks, sentences are counted by their closing punctuation mark.
stats = {
    'Characters': len(story),
    'Lines': len(story.splitlines()),
    'Words': len(story.split()),
    'Sentences': sum(story.count(mark) for mark in ('.', '!', '?')),
}

### write the same dictionary out once per serialization format
for filename, dump in (
    ('stats.yaml', yaml.dump),
    ('stats.json', json.dump),
    ('stats.toml', toml.dump),
):
    with open(filename, 'w') as outfile:
        dump(stats, outfile)

### Fetch a second URL (note the 'a' before '.txt' in the file name).
### text_from_url prints a message and returns None when the server
### reports an HTTP error.
### NOTE(review): presumably this URL is deliberately wrong, to show that
### error path -- confirm.  `raw` is reassigned further down before this
### value is used.
raw = text_from_url("https://www.gutenberg.org/cache/epub/29019/pg29019a.txt")

def strip_gutenberg(raw):
    """
    Return the text between the Project Gutenberg start and end banners.

    Everything up to and including the start banner
    ('*** START' ... ' ***') is dropped, as is everything from
    '*** END OF THE PROJECT GUTENBERG EBOOK' onwards.  Leading and
    trailing whitespace is stripped from what remains.

    Raises ValueError if any of the three markers cannot be found.
    """
    start_marker = '*** START'
    banner_close = ' ***'
    end_marker = '*** END OF THE PROJECT GUTENBERG EBOOK'

    ### skip ahead to just past the start of the start message
    _, found, remainder = raw.partition(start_marker)
    if not found:
        raise ValueError(f"start marker {start_marker!r} not found")

    ### skip the rest of the start message; the final strip() takes care
    ### of the line break after it, so nothing extra is skipped here
    _, found, remainder = remainder.partition(banner_close)
    if not found:
        raise ValueError(
            f"closing {banner_close!r} of the start marker not found")

    ### keep only what comes before the end message
    body, found, _ = remainder.partition(end_marker)
    if not found:
        raise ValueError(f"end marker {end_marker!r} not found")

    return body.strip()

### download another book and strip the Gutenberg header and footer
raw = text_from_url("https://www.gutenberg.org/cache/epub/2852/pg2852.txt")
cooked = strip_gutenberg(raw)

### show the first and the last 200 characters of the stripped text
for heading, excerpt in (
    ('\n*** The beginning\n', cooked[:200]),
    ('\n*** The end\n', cooked[-200:]),
):
    print(heading)
    print(excerpt)