#!/usr/bin/env python3

import json, re
from collections import defaultdict
from nltk import word_tokenize
from nltk.corpus import cmudict   # requires the cmudict data: nltk.download('cmudict')

"""
Sample part of the output (hand-formatted):

{"lineId": "3-2",
 "lineNum": 2,
 "text": "Make my coat look new, dear, sew it!",
 "tokens": ["Make", "my", "coat", "look", "new", ",", "dear", ",", "sew", "it", "!"],
 "rhymeWords": ["sew", "it"],
 "rhymeProns": [["S OW1"], ["IH1 T", "IH0 T"]]
},
"""

# Load the cmudict entries into a data structure.
# Store each pronunciation as a STRING of phonemes (separated by spaces).
...

# Load chaos.json
...

# For each line of the poem, add a "rhymeProns" entry
# which is a list parallel with "rhymeWords".
# For each word, it contains a list of possible pronunciations.
...

# Write the enhanced data to chaos.pron.json
...

"""
TODO: Answer the question:

- How many rhyme words are NOT found in cmudict (they are "out-of-vocabulary",
  or "OOV")? Give some examples.

...
"""
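
# ----------------------------------------------------------------------
# A minimal sketch of one way to fill in the steps above (it reuses the
# imports at the top of this file).  Assumptions not stated in the
# skeleton: chaos.json is a JSON array of line objects, each carrying a
# "rhymeWords" list as in the sample output, and OOV rhyme words get an
# empty pronunciation list.  Adjust to whatever the assignment expects.
# ----------------------------------------------------------------------

# cmudict as {word: [pronunciation, ...]}, each pronunciation a single
# space-joined string of ARPAbet phonemes, e.g. "IH1 T".
# Note: nltk's cmudict.dict() keys are lowercase.
pron_dict = {word: [" ".join(phones) for phones in pron_lists]
             for word, pron_lists in cmudict.dict().items()}

with open("chaos.json", encoding="utf-8") as f:
    poem_lines = json.load(f)

oov_words = []
for line in poem_lines:
    rhyme_prons = []
    for word in line["rhymeWords"]:
        # Look up by lowercased form; [] marks an OOV word.
        word_prons = pron_dict.get(word.lower(), [])
        if not word_prons:
            oov_words.append(word)
        rhyme_prons.append(word_prons)
    line["rhymeProns"] = rhyme_prons

with open("chaos.pron.json", "w", encoding="utf-8") as f:
    json.dump(poem_lines, f, indent=2)

# Starting point for the TODO question above.
print(len(oov_words), "OOV rhyme words, e.g.:", oov_words[:10])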