From c4b8484cabed1f99aab9fe53a6d125610f02af95 Mon Sep 17 00:00:00 2001
From: will king
Date: Fri, 31 Mar 2023 16:57:02 -0700
Subject: [PATCH] added submission date to extractor

---
Review notes (this section is not part of the commit message):
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Indentation and blank lines of the context lines are a
  best guess; use `git apply --ignore-whitespace` if a hunk is rejected.
* get_data_from_versions: print(getting_data_from_versions) referenced an
  undefined name and raised NameError on every call; it now prints a string
  literal. The second dump of version_date_dict was dropped because
  extract_submission_dates already prints the same dict just before.
* extract_submission_dates: version_number / version_date are reset for
  every row and an entry is stored only when both cells were found, so a row
  without the tagged cells can neither raise NameError nor reuse values from
  another row. An empty "headers" attribute is skipped and the date text is
  stripped before parsing.
* Hunk headers and the diffstat were recomputed for the new line counts. The
  post-image blob id on the "index" line predates these edits.

 scripts/drugtools/historical_nct_extractor.py | 49 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/scripts/drugtools/historical_nct_extractor.py b/scripts/drugtools/historical_nct_extractor.py
index f1da033..ab62bf3 100644
--- a/scripts/drugtools/historical_nct_extractor.py
+++ b/scripts/drugtools/historical_nct_extractor.py
@@ -28,10 +28,11 @@ class VersionData():
     It will also implement the ability to load the data to the database
     """
 
-    def __init__(self,nct_id,version_id):
+    def __init__(self,nct_id,version_id,submission_date):
         #identifiers
         self.nct_id = nct_id.strip()
         self.version_id = version_id
+        self.submission_date = submission_date
 
         #Study Status
         self._primary_completion_date = None
@@ -56,6 +57,7 @@ class VersionData():
                 (
                     nct_id,
                     version,
+                    submission_date,
                     primary_completion_date,
                     primary_completion_date_category,
                     start_date,
@@ -82,6 +84,7 @@ class VersionData():
                     %s,
                     %s,
                     %s,
+                    %s,
                     %s
                 )
             """
@@ -93,6 +96,7 @@ class VersionData():
                 (
                     self.nct_id,
                     self.version_id,
+                    self.submission_date,
                     self._primary_completion_date,
                     self._primary_completion_date_category,
                     self._start_date,
@@ -113,6 +117,41 @@ class VersionData():
         db_connection.commit()
 
 
+############ Functions
+def extract_submission_dates(soup):
+    """
+    Extract the submission date of each version.
+
+    Walks the tr rows of the table inside the first fieldset of `soup`
+    and pairs the "VersionNumber" cell of each row with its "VersionDate"
+    cell. Returns a dict keyed by the version number (int) whose values are
+    the dates parsed with the "%B %d, %Y" format.
+    """
+    table_rows = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")
+
+    version_date_dict = {}
+
+    for row in reversed(table_rows):
+        # Start every row from a clean slate so a row missing either cell
+        # can neither raise NameError nor reuse values from another row.
+        version_number = None
+        version_date = None
+        for td in row.findChildren("td"):
+            headers = td.attrs.get("headers")
+            if not headers:
+                continue
+            if headers[0] == "VersionNumber":
+                version_number = int(td.text)
+            elif headers[0] == "VersionDate":
+                version_date = td.text
+        if version_number is not None and version_date is not None:
+            version_date_dict[version_number] = datetime.strptime(
+                version_date.strip(), "%B %d, %Y"
+            )
+
+    print(version_date_dict)
+    return version_date_dict
+
 def optional_strip(possible_string):
     if type(possible_string) == str:
         return possible_string.strip()
@@ -370,10 +409,13 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
 
 def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
     soup = BeautifulSoup(html,"lxml")
+    print("getting_data_from_versions")
+
+    version_date_dict = extract_submission_dates(soup)
 
     #preallocate version data
-    version_a = VersionData(nct_id, version_a_int)
-    version_b = VersionData(nct_id, version_b_int)
+    version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int])
+    version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int])
 
     #extract data from html and put it in the preallocated objects
     get_forms(soup, version_a, version_b)
@@ -394,6 +436,7 @@ def run():
             curse.execute(sql)
             for response in tqdm(curse.fetchall()):
                 nct_id, version_a, version_b, html = response
+                print(nct_id)
 
                 print(nct_id, version_a, version_b) if VERBOSE else ""
 