diff --git a/scripts/drugtools/historical_nct_extractor.py b/scripts/drugtools/historical_nct_extractor.py
index f1da033..ab62bf3 100644
--- a/scripts/drugtools/historical_nct_extractor.py
+++ b/scripts/drugtools/historical_nct_extractor.py
@@ -28,10 +28,11 @@ class VersionData():
It will also implement the ability to load the data to the database
"""
- def __init__(self,nct_id,version_id):
+ def __init__(self,nct_id,version_id,submission_date):
#identifiers
self.nct_id = nct_id.strip()
self.version_id = version_id
+ self.submission_date = submission_date
#Study Status
self._primary_completion_date = None
@@ -56,6 +57,7 @@ class VersionData():
(
nct_id,
version,
+ submission_date,
primary_completion_date,
primary_completion_date_category,
start_date,
@@ -82,6 +84,7 @@ class VersionData():
%s,
%s,
%s,
+ %s,
%s
)
"""
@@ -93,6 +96,7 @@ class VersionData():
(
self.nct_id,
self.version_id,
+ self.submission_date,
self._primary_completion_date,
self._primary_completion_date_category,
self._start_date,
@@ -113,6 +117,28 @@ class VersionData():
db_connection.commit()
+############ Functions
+def extract_submission_dates(soup):
+    """
+    Map each version number in the history table to its submission date.
+
+    The table is taken from the first <fieldset> of the page. Only <td>
+    cells with a "headers" attribute are read: the cell whose first header
+    is "VersionNumber" gives the key and the cell whose first header is
+    "VersionDate" gives the value.
+
+    Parameters:
+        soup: parsed page exposing the BeautifulSoup tag API.
+
+    Returns:
+        dict mapping int version number -> datetime parsed with the
+        "%B %d, %Y" format (e.g. "January 5, 2020").
+
+    Raises:
+        IndexError: if the page contains no <fieldset>.
+        AttributeError: if that fieldset has no table/tbody to walk.
+        ValueError: if a version number or date cell cannot be parsed.
+    """
+    table_rows = soup.find_all("fieldset")[0].table.tbody.find_all("tr")
+
+    version_date_dict = {}
+
+    # Rows are walked bottom-up, as before, so the resulting key order and
+    # the winner among duplicate version numbers stay the same.
+    for row in reversed(table_rows):
+        # Collect this row's cells into a fresh dict so that a row missing
+        # either cell (e.g. a header row) can never reuse values left over
+        # from another row.
+        cells = {}
+        for td in row.find_all("td"):
+            headers = td.attrs.get("headers")
+            if headers:
+                # Strip padding: strptime rejects surrounding whitespace.
+                cells[headers[0]] = td.text.strip()
+
+        if "VersionNumber" in cells and "VersionDate" in cells:
+            version_number = int(cells["VersionNumber"])
+            version_date_dict[version_number] = datetime.strptime(
+                cells["VersionDate"], "%B %d, %Y"
+            )
+
+    return version_date_dict
+
def optional_strip(possible_string):
if type(possible_string) == str:
return possible_string.strip()
@@ -370,10 +396,14 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
     soup = BeautifulSoup(html,"lxml")
+
+    # Submission dates keyed by integer version number; a version missing
+    # from the history table raises KeyError below rather than being
+    # stored without a date.
+    version_date_dict = extract_submission_dates(soup)
     #preallocate version data
-    version_a = VersionData(nct_id, version_a_int)
-    version_b = VersionData(nct_id, version_b_int)
+    version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int])
+    version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int])
     #extract data from html and put it in the preallocated objects
     get_forms(soup, version_a, version_b)
@@ -394,6 +424,7 @@ def run():
curse.execute(sql)
for response in tqdm(curse.fetchall()):
nct_id, version_a, version_b, html = response
+ print(nct_id)
print(nct_id, version_a, version_b) if VERBOSE else ""