"""Helpers for extracting text, dates and bracketed tags from table cells."""

from copy import copy
from datetime import datetime
import re

from bs4 import BeautifulSoup

# Sample inputs used only by the demo in ``main()``.
# NOTE(review): as they stand these samples contain no <tr>/<td>/<span>
# markup, yet the demo looks up ``td`` cells and ``add_hilite`` elements in
# them -- presumably the HTML was lost at some point; confirm and restore
# before relying on the demo output.
form = """

Study Status


Record Verification:	April 2008 2017
Overall Status:	Recruiting Completed
Study Start:	March 2008
Primary Completion:	December 2009 [Actual]
Study Completion:	December 2009 [ Anticipated Actual]

First Submitted:	April 10, 2008
First Submitted that Met QC Criteria:	April 10, 2008
First Posted:	April 15, 2008 [Estimate]

Results First Submitted:	February 6, 2014
Results First Submitted that Met QC Criteria:	August 29, 2014
Results First Posted:	September 9, 2014 [Estimate]

Last Update Submitted that Met QC Criteria:	April 10 18, 2008 2017
Last Update Posted:	April 15, 2008 [Estimate] May 19, 2017 [Actual]

"""
entry1 = """ Record Verification: April 2008 2017 """
entry2 = ' December 2009 [Actual] '

# Raw strings keep the regex escapes (\[, \w, \s) out of Python's own string
# escape processing; the pattern text is unchanged.
# NOTE(review): both *_HILITE patterns can match the empty string and any run
# of word characters, so ``sub`` rewrites the whole input rather than only
# highlighted spans -- presumably some surrounding markup is missing from
# these patterns; confirm against real input.
DROP_HILITE_re = re.compile(r'[\[\]\w]*\s?')
ADD_HILITE_re = re.compile(r'\w*\s?')
# The hyphen is escaped so it is a literal character. Written unescaped as
# ``=-_`` it formed a range from '=' to '_', which let '>' (and ?@[\]^) count
# as tag content and left '-' out of the class.
TAGS_RE = re.compile(r'<[=\-_,.:;"/\w\s]+>')


def extract_new_data(td: object) -> str:
    """Return ``str(td)`` with DROP_HILITE_re matches blanked and tags removed.

    Every DROP_HILITE_re match is replaced by a single space, every TAGS_RE
    match is then deleted, and the result is stripped of surrounding
    whitespace.
    """
    without_dropped = DROP_HILITE_re.sub(" ", str(td))
    return TAGS_RE.sub("", without_dropped).strip()


def extract_old_data(td: object) -> str:
    """Return ``str(td)`` with ADD_HILITE_re matches blanked and tags removed.

    Every ADD_HILITE_re match is replaced by a single space, every TAGS_RE
    match is then deleted, and the result is stripped of surrounding
    whitespace.
    """
    without_added = ADD_HILITE_re.sub(" ", str(td))
    return TAGS_RE.sub("", without_added).strip()


def delete_tags(td: object) -> str:
    """Return ``str(td)`` with every TAGS_RE match replaced by one space.

    The result is stripped of leading and trailing whitespace.
    """
    return TAGS_RE.sub(" ", str(td)).strip()


def extract_date_and_tag(
    text: str, date_format: str
) -> "str | tuple[str | None, datetime]":
    """Split ``text`` into a parsed date and an optional bracketed tag.

    The part of ``text`` before the first ``[`` is stripped and parsed with
    ``datetime.strptime`` using ``date_format``.  If a ``[`` is present, the
    tag is the text between the first ``[`` and whichever of ``]`` or the
    next ``[`` comes first, stripped of whitespace; otherwise the tag is
    ``None``.

    Args:
        text: The string to parse, e.g. ``"April 2008 [ test ]"``.
        date_format: A ``strptime`` format string, e.g. ``"%B %Y"``.

    Returns:
        ``(estimate_tag, date_object)`` for non-empty ``text``.  For falsy
        ``text`` the single-space string ``" "`` is returned instead.
        NOTE(review): that sentinel has a different shape from the tuple and
        cannot be unpacked into two names; kept as-is for existing callers.

    Raises:
        ValueError: If the date part does not match ``date_format``.
    """
    if not text:
        return " "

    date_split = text.split("[")
    if len(date_split) > 1:
        estimate_tag = date_split[1].split("]")[0].strip()
    else:
        estimate_tag = None

    date_object = datetime.strptime(date_split[0].strip(), date_format)
    return estimate_tag, date_object


# TODO: Write test
def extract_text_and_tag(text):
    """Not implemented yet; currently does nothing and returns ``None``."""
    pass


def main():
    """Run the ad-hoc demo: parse the sample strings and print the results."""
    entry = BeautifulSoup(entry1, "lxml")
    form_soup = BeautifulSoup(form, "lxml")

    # The second <td> of the single-row sample is the value cell.
    value_cell = entry.find_all("td")[1]
    print(extract_new_data(value_cell))
    print(extract_old_data(value_cell))

    for tr in form_soup.find_all("tr"):
        data = tr.find_all("td")
        if not data:
            print("no data")
        elif len(data) == 1:
            print("1\t", data[0])
        else:
            print(
                len(data),
                "\t",
                extract_new_data(data[1]),
                "\t|\t",
                extract_old_data(data[1]),
            )

    # print(extract_date_and_tag(extract_old_data(entry.find_all("td")[1]), "%B %Y"))
    print(extract_date_and_tag("April 2008 [ test ]", "%B %Y"))

    entry2_soup = BeautifulSoup(entry2, "lxml")
    print(extract_old_data(entry2_soup))  # original author's note: "error here."
    print(extract_new_data(entry2_soup))

    # Work on a copy so extracting an element leaves entry2_soup untouched.
    entry3 = copy(entry2_soup)
    print(entry3)
    entry4 = entry3.find_all(class_="add_hilite")[0].extract()
    print(entry3.text)
    print(entry4.text)


if __name__ == "__main__":
    main()