#!/usr/bin/env python3
"""
Converts chaos.html into JSON. A sample of the input:

Dearest creature in creation
Studying English pronunciation,
   I will teach you in my verse
   Sounds like corpse, corps, horse and worse.

A hand-formatted portion of the output (note that indentation, line breaks,
order of dict entries, etc. don't matter as long as the data matches):

[ ...
  {"stanza": 3, "lines": [
    {"lineId": "3-1", "lineNum": 1,
     "text": "Pray, console your loving poet,",
     "tokens": ["Pray", ",", "console", "your", "loving", "poet", ","],
     "rhymeWords": ["poet"]},
    {"lineId": "3-2", "lineNum": 2,
     "text": "Make my coat look new, dear, sew it!",
     "tokens": ["Make", "my", "coat", "look", "new", ",", "dear", ",", "sew", "it", "!"],
     "rhymeWords": ["sew", "it"]},
    ... ]},
  ...
  {"stanza": 9, "lines": [
    {"lineId": "9-1", "lineNum": 1,
     "text": "From \"desire\": desirable - admirable from \"admire\",",
     "tokens": ["From", "``", "desire", "''", ":", "desirable", "-", "admirable", "from", "``", "admire", "''", ","],
     "rhymeWords": ["admire"]},
    ... ]},
  ... ]
"""

import json, re

from nltk import word_tokenize


def hasalpha(token):
    # Whether any character in the token is a letter.
    return any(ch.isalpha() for ch in token)


# Regex that breaks an HTML line into parts: line number within the stanza,
# main portion, spacing.
LINE_RE = None  # TODO (one hedged guess appears in the sketch at the end of this file)

# TODO: read from chaos.html, construct data structure, write to chaos.json
# (see the hedged sketch below for one possible approach).
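
# ---------------------------------------------------------------------------
# Hedged sketch, not the reference solution.  It assumes a hypothetical layout
# for chaos.html: one verse line per source line, an optional leading line
# number, indentation via spaces or &nbsp; entities, and blank lines between
# stanzas.  The real chaos.html may well differ, in which case the regex and
# parse_html() below need adjusting.  The names EXAMPLE_LINE_RE, strip_tags,
# rhyme_words, and parse_html are all introduced here for illustration; only
# the output shape follows the hand-formatted sample in the docstring.
# ---------------------------------------------------------------------------

# Hypothetical pattern: optional line number, main portion, trailing spacing.
EXAMPLE_LINE_RE = re.compile(r'^\s*(\d*)\s*(.*?)(\s*)$')


def strip_tags(html_line):
    """Drop HTML tags and turn &nbsp; into plain spaces (sketch)."""
    return re.sub(r'<[^>]+>', '', html_line).replace('&nbsp;', ' ')


def rhyme_words(tokens):
    """Placeholder rhyme extraction: the last token containing a letter.

    The docstring sample includes multi-word rhymes such as ["sew", "it"],
    so a real implementation needs a smarter rule than this one.
    """
    alpha = [t for t in tokens if hasalpha(t)]
    return alpha[-1:]


def parse_html(path='chaos.html'):
    """Sketch: build the list of stanza dicts shown in the docstring."""
    stanzas, current = [], []
    with open(path, encoding='utf-8') as f:
        for raw in f:
            line = strip_tags(raw)
            if not line.strip():              # blank line => stanza boundary
                if current:
                    stanzas.append(current)
                    current = []
                continue
            m = EXAMPLE_LINE_RE.match(line)   # number / main portion / spacing
            if m and m.group(2):
                current.append(m.group(2))
    if current:
        stanzas.append(current)

    data = []
    for s_num, stanza_lines in enumerate(stanzas, start=1):
        lines = []
        for l_num, text in enumerate(stanza_lines, start=1):
            tokens = word_tokenize(text)
            lines.append({
                "lineId": f"{s_num}-{l_num}",
                "lineNum": l_num,
                "text": text,
                "tokens": tokens,
                "rhymeWords": rhyme_words(tokens),
            })
        data.append({"stanza": s_num, "lines": lines})
    return data


if __name__ == '__main__':
    with open('chaos.json', 'w', encoding='utf-8') as out:
        json.dump(parse_html(), out, indent=2)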