#!/usr/bin/env python3

import json, re
from collections import defaultdict
from nltk import word_tokenize
from nltk.corpus import cmudict   # requires the cmudict data: nltk.download('cmudict')

"""
Sample part of the output (hand-formatted):

{"lineId": "3-2",
 "lineNum": 2,
 "text": "Make my coat look new, dear, sew it!",
 "tokens": ["Make", "my", "coat", "look", "new", ",", "dear", ",", "sew", "it", "!"],
 "rhymeWords": ["sew", "it"],
 "rhymeProns": [["S OW1"], ["IH1 T", "IH0 T"]]
},
"""

# Load the cmudict entries into a data structure.
# Store each pronunciation as a STRING of phonemes (separated by spaces).
...

# Load chaos.json
...

# For each line of the poem, add a "rhymeProns" entry
# which is a list parallel with "rhymeWords".
# For each word, it contains a list of possible pronunciations.
...

# Write the enhanced data to chaos.pron.json
...

"""
TODO: Answer the question:

- How many rhyme words are NOT found in cmudict (they are "out-of-vocabulary",
  or "OOV")? Give some examples.

...
"""
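
# ----------------------------------------------------------------------
# A minimal sketch of one way to fill in the steps above (it reuses the
# imports at the top of this file).  Assumptions not stated in the
# skeleton: chaos.json is a JSON array of line objects, each carrying a
# "rhymeWords" list as in the sample output, and OOV rhyme words get an
# empty pronunciation list.  Adjust to whatever the assignment expects.
# ----------------------------------------------------------------------

# cmudict as {word: [pronunciation, ...]}, each pronunciation a single
# space-joined string of ARPAbet phonemes, e.g. "IH1 T".
# Note: nltk's cmudict.dict() keys are lowercase.
pron_dict = {word: [" ".join(phones) for phones in pron_lists]
             for word, pron_lists in cmudict.dict().items()}

with open("chaos.json", encoding="utf-8") as f:
    poem_lines = json.load(f)

oov_words = []
for line in poem_lines:
    rhyme_prons = []
    for word in line["rhymeWords"]:
        # Look up by lowercased form; [] marks an OOV word.
        word_prons = pron_dict.get(word.lower(), [])
        if not word_prons:
            oov_words.append(word)
        rhyme_prons.append(word_prons)
    line["rhymeProns"] = rhyme_prons

with open("chaos.pron.json", "w", encoding="utf-8") as f:
    json.dump(poem_lines, f, indent=2)

# Starting point for the TODO question above.
print(len(oov_words), "OOV rhyme words, e.g.:", oov_words[:10])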