#!/usr/bin/env python3
"""
Converts chaos.html into JSON. A sample of the input:

Dearest creature in creation
Studying English pronunciation,
   I will teach you in my verse
   Sounds like corpse, corps, horse and worse.

A hand-formatted portion of the output (note that indentation, line breaks,
order of dict entries, etc. don't matter as long as the data matches):

[ ...
  {"stanza": 3, "lines": [
    {"lineId": "3-1", "lineNum": 1,
     "text": "Pray, console your loving poet,",
     "tokens": ["Pray", ",", "console", "your", "loving", "poet", ","],
     "rhymeWords": ["poet"]},
    {"lineId": "3-2", "lineNum": 2,
     "text": "Make my coat look new, dear, sew it!",
     "tokens": ["Make", "my", "coat", "look", "new", ",", "dear", ",", "sew", "it", "!"],
     "rhymeWords": ["sew", "it"]},
    ... ]},
  ...
  {"stanza": 9, "lines": [
    {"lineId": "9-1", "lineNum": 1,
     "text": "From \"desire\": desirable - admirable from \"admire\",",
     "tokens": ["From", "``", "desire", "''", ":", "desirable", "-", "admirable", "from", "``", "admire", "''", ","],
     "rhymeWords": ["admire"]},
    ... ]},
  ... ]
"""

import json, re

from nltk import word_tokenize


def hasalpha(token):
    # Whether any character in the token is a letter.
    return any(ch.isalpha() for ch in token)


# Regex that breaks an HTML line into parts: line number within the stanza,
# main portion, spacing.
LINE_RE = None  # TODO (one hedged guess appears in the sketch at the end of this file)

# TODO: read from chaos.html, construct data structure, write to chaos.json
# (see the hedged sketch below for one possible approach).
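
# ---------------------------------------------------------------------------
# Hedged sketch, not the reference solution.  It assumes a hypothetical layout
# for chaos.html: one verse line per source line, an optional leading line
# number, indentation via spaces or &nbsp; entities, and blank lines between
# stanzas.  The real chaos.html may well differ, in which case the regex and
# parse_html() below need adjusting.  The names EXAMPLE_LINE_RE, strip_tags,
# rhyme_words, and parse_html are all introduced here for illustration; only
# the output shape follows the hand-formatted sample in the docstring.
# ---------------------------------------------------------------------------

# Hypothetical pattern: optional line number, main portion, trailing spacing.
EXAMPLE_LINE_RE = re.compile(r'^\s*(\d*)\s*(.*?)(\s*)$')


def strip_tags(html_line):
    """Drop HTML tags and turn &nbsp; into plain spaces (sketch)."""
    return re.sub(r'<[^>]+>', '', html_line).replace('&nbsp;', ' ')


def rhyme_words(tokens):
    """Placeholder rhyme extraction: the last token containing a letter.

    The docstring sample includes multi-word rhymes such as ["sew", "it"],
    so a real implementation needs a smarter rule than this one.
    """
    alpha = [t for t in tokens if hasalpha(t)]
    return alpha[-1:]


def parse_html(path='chaos.html'):
    """Sketch: build the list of stanza dicts shown in the docstring."""
    stanzas, current = [], []
    with open(path, encoding='utf-8') as f:
        for raw in f:
            line = strip_tags(raw)
            if not line.strip():              # blank line => stanza boundary
                if current:
                    stanzas.append(current)
                    current = []
                continue
            m = EXAMPLE_LINE_RE.match(line)   # number / main portion / spacing
            if m and m.group(2):
                current.append(m.group(2))
    if current:
        stanzas.append(current)

    data = []
    for s_num, stanza_lines in enumerate(stanzas, start=1):
        lines = []
        for l_num, text in enumerate(stanza_lines, start=1):
            tokens = word_tokenize(text)
            lines.append({
                "lineId": f"{s_num}-{l_num}",
                "lineNum": l_num,
                "text": text,
                "tokens": tokens,
                "rhymeWords": rhyme_words(tokens),
            })
        data.append({"stanza": s_num, "lines": lines})
    return data


if __name__ == '__main__':
    with open('chaos.json', 'w', encoding='utf-8') as out:
        json.dump(parse_html(), out, indent=2)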