from copy import copy
from datetime import datetime
from bs4 import BeautifulSoup
import re
# Sample inputs for the __main__ demo at the bottom of this file; each one is
# handed to BeautifulSoup(..., "lxml") there.
# NOTE(review): as written these literals contain no HTML tags, so the demo's
# find_all("td") / find_all("tr") lookups would find nothing -- confirm the
# fixtures are intact.

# `form`: parsed as a whole; the demo iterates over its <tr> rows.
form = """
|
|
"""
# `entry1`: the demo reads the second <td> cell of this entry.
entry1 = """
| Record Verification: |
April 2008 2017 |
"""
# `entry2`: single-line sample; the parsed document itself is passed to the
# extract_* helpers and searched for class "add_hilite".
entry2 = ' December 2009 [Actual] | '
# Compiled patterns shared by extract_new_data / extract_old_data / delete_tags.
# Raw string literals are used so the regex escapes (\[, \], \w, \s) are not
# parsed as (invalid) string escapes; the compiled patterns are unchanged.
# NOTE(review): both *_HILITE patterns match any run of word characters (and
# the empty string), not only highlighted markup -- confirm they are complete.
DROP_HILITE_re = re.compile(r'[\[\]\w]*\s?')
ADD_HILITE_re = re.compile(r'\w*\s?')
# NOTE(review): inside the character class, `=-_` is a *range* (U+003D..U+005F),
# so a literal "-" is not matched while ">", "?", "@", "[", "\", "]" and "^"
# are. Kept as-is to preserve behaviour; confirm whether the three literal
# characters "=", "-", "_" were intended.
TAGS_RE = re.compile(r'<[=-_,.:;"/\w\s]+>')
def extract_new_data(td):
    """Return the cleaned text of *td* with DROP_HILITE_re matches blanked out.

    *td* is converted to its string form, every DROP_HILITE_re match is
    replaced by a single space, every TAGS_RE match is then removed, and the
    result is stripped of leading/trailing whitespace.

    Args:
        td: Any object whose ``str()`` is the markup to clean (the demo in
            this file passes BeautifulSoup nodes).

    Returns:
        The cleaned string.
    """
    markup = str(td)
    without_dropped = DROP_HILITE_re.sub(" ", markup)
    return TAGS_RE.sub("", without_dropped).strip()
def extract_old_data(td):
    """Return the cleaned text of *td* with ADD_HILITE_re matches blanked out.

    *td* is converted to its string form, every ADD_HILITE_re match is
    replaced by a single space, every TAGS_RE match is then removed, and the
    result is stripped of leading/trailing whitespace.

    Args:
        td: Any object whose ``str()`` is the markup to clean (the demo in
            this file passes BeautifulSoup nodes).

    Returns:
        The cleaned string.
    """
    markup = str(td)
    without_added = ADD_HILITE_re.sub(" ", markup)
    return TAGS_RE.sub("", without_added).strip()
def delete_tags(td):
    """Return the string form of *td* with every TAGS_RE match replaced by a space.

    Unlike the extract_* helpers, matches are replaced by ``" "`` rather than
    removed outright; the result is stripped of leading/trailing whitespace.

    Args:
        td: Any object whose ``str()`` is the markup to clean.

    Returns:
        The cleaned string.
    """
    markup = str(td)
    spaced = TAGS_RE.sub(" ", markup)
    return spaced.strip()
def extract_date_and_tag(text, date_format):
    """Split *text* into an optional bracketed estimate tag and a parsed date.

    The part of *text* before the first ``"["`` is whitespace-stripped and
    parsed with ``datetime.strptime`` using *date_format*. If a ``"["`` is
    present, the text between it and the next ``"]"`` (whitespace-stripped)
    is the estimate tag.

    Args:
        text: String such as ``"April 2008 [ test ]"``. May be empty, blank
            or None.
        date_format: ``strptime`` format of the date part, e.g. ``"%B %Y"``.

    Returns:
        A ``(estimate_tag, date_object)`` tuple. ``estimate_tag`` is None when
        *text* contains no ``"["``. For empty, blank or None *text* the result
        is ``(None, None)``, so callers can always unpack two values.

    Raises:
        ValueError: If the date part does not match *date_format*.
    """
    # Always return a 2-tuple: the previous `" "` return for empty input broke
    # callers that unpack the result, and blank input crashed in strptime("").
    if not text or not text.strip():
        return None, None

    date_split = text.split("[")
    if len(date_split) > 1:
        # Keep only what precedes the closing bracket, e.g. " test ]" -> "test".
        estimate_tag = date_split[1].split("]")[0].strip()
    else:
        estimate_tag = None

    date_object = datetime.strptime(date_split[0].strip(), date_format)
    return estimate_tag, date_object
#TODO: Write test
def extract_text_and_tag(text):
    """Placeholder: not implemented yet.

    Currently ignores *text* and returns None.
    """
    return None
if __name__ == "__main__":
    # Ad-hoc demo of the helpers above; everything is printed to stdout.
    entry_soup = BeautifulSoup(entry1, "lxml")
    form_soup = BeautifulSoup(form, "lxml")

    # Both extractors are run on the second <td> of the sample entry.
    second_cell = entry_soup.find_all("td")[1]
    print(extract_new_data(second_cell))
    print(extract_old_data(second_cell))

    # One output line per table row, depending on how many cells it has.
    for row in form_soup.find_all("tr"):
        cells = row.find_all("td")
        cell_count = len(cells)
        if cell_count == 0:
            print("no data")
        elif cell_count == 1:
            print("1\t", cells[0])
        else:
            print(cell_count, "\t", extract_new_data(cells[1]), "\t|\t", extract_old_data(cells[1]))

    #print(extract_date_and_tag(extract_old_data(Entry.find_all("td")[1]), "%B %Y"))
    print(extract_date_and_tag("April 2008 [ test ]", "%B %Y"))

    entry2_soup = BeautifulSoup(entry2, "lxml")
    # NOTE: the original author marked the next call with "error here."
    print(extract_old_data(entry2_soup))
    print(extract_new_data(entry2_soup))

    # Work on a copy so that extract() does not modify entry2_soup.
    entry2_copy = copy(entry2_soup)
    print(entry2_copy)
    # extract() detaches the first "add_hilite" element from the copy and
    # returns it; [0] raises IndexError when no such element exists.
    removed_hilite = entry2_copy.find_all(class_="add_hilite")[0].extract()
    print(entry2_copy.text)
    print(removed_hilite.text)