russiangram_parser.py (736B)
import codecs
from html.parser import HTMLParser


class RussiangramParser(HTMLParser):
    """Extract and decode the text of one specific ``<textarea>`` element.

    The parser watches for the ``<textarea>`` whose ``id`` equals
    ``MainContent_UserSentenceTextbox``.  The text inside it is expected to
    contain literal ``\\xHH`` escape sequences (a backslash, an ``x`` and
    hex digits as plain characters); those are converted back to bytes and
    decoded with the default codec (UTF-8).  The result is stored in
    ``self.data``.

    NOTE(review): presumably the fed markup is the ``str()`` of a bytes
    object, which is why the escapes appear literally -- confirm against
    the caller.
    """

    # id attribute of the textarea whose content is extracted.
    _TEXTAREA_ID = "MainContent_UserSentenceTextbox"

    def __init__(self):
        # True while the parser is between the target textarea's tags.
        self.intextarea = False
        # Decoded text of the most recently handled chunk of the target
        # textarea; stays '' until such a chunk has been seen.
        self.data = ''
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Start capturing when the target textarea opens.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.
        """
        # .get() instead of indexing: a textarea without an id attribute
        # must simply not match rather than raise KeyError.
        if tag == "textarea" and dict(attrs).get("id") == self._TEXTAREA_ID:
            self.intextarea = True

    def handle_endtag(self, tag):
        """Stop capturing when the target textarea closes."""
        if tag == "textarea" and self.intextarea:
            self.intextarea = False

    def handle_data(self, data):
        """Decode ``\\xHH`` escapes in the target textarea's text.

        Text outside the target textarea is ignored.  Each call replaces
        ``self.data`` with the decoded value of ``data``.

        Raises:
            ValueError: if a fragment is not a hex number in ``0..255``.
            UnicodeDecodeError: if the collected bytes are not valid UTF-8.
        """
        if not self.intextarea:
            return

        byte_values = []
        for fragment in data.split('\\x'):
            # Fragments that still contain a backslash carry some other
            # literal escape (e.g. "\r\n") and are dropped as a whole.
            if '\\' in fragment:
                continue
            # Splitting yields an empty fragment when the text starts with
            # an escape, and a blank one for whitespace-only text; int()
            # would raise ValueError on either, so skip them.
            if not fragment.strip():
                continue
            # NOTE(review): int() ignores surrounding whitespace, so spaces
            # next to an escape are discarded; other literal characters
            # between escapes make the conversion fail.
            byte_values.append(int(fragment, 16))

        self.data = bytes(byte_values).decode()