added submission date to extractor

llm-extraction v1.0.0
will king 3 years ago
parent 9a718f72a0
commit 3530dc96b6

@ -28,10 +28,11 @@ class VersionData():
It will also implement the ability to load the data to the database
"""
def __init__(self,nct_id,version_id):
def __init__(self,nct_id,version_id,submission_date):
#identifiers
self.nct_id = nct_id.strip()
self.version_id = version_id
self.submission_date = submission_date
#Study Status
self._primary_completion_date = None
@ -56,6 +57,7 @@ class VersionData():
(
nct_id,
version,
submission_date,
primary_completion_date,
primary_completion_date_category,
start_date,
@ -82,6 +84,7 @@ class VersionData():
%s,
%s,
%s,
%s,
%s
)
"""
@ -93,6 +96,7 @@ class VersionData():
(
self.nct_id,
self.version_id,
self.submission_date,
self._primary_completion_date,
self._primary_completion_date_category,
self._start_date,
@ -113,6 +117,28 @@ class VersionData():
db_connection.commit()
############ Functions
def extract_submission_dates(soup):
    """
    Extract the submission date of each version.

    Reads the rows of the table inside the first <fieldset> of ``soup`` and,
    for every row, looks for the cells marked ``headers="VersionNumber"`` and
    ``headers="VersionDate"``.

    Args:
        soup: parsed document exposing ``findChildren`` (a BeautifulSoup tree
            in the visible caller).

    Returns:
        dict mapping the integer version number to a ``datetime`` parsed from
        the "%B %d, %Y" date text (e.g. "January 5, 2010").

    Raises:
        ValueError: if a version number is not an integer or a date does not
            match the expected format.
    """
    table_rows = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")

    version_date_dict = {}
    for row in reversed(table_rows):
        # Reset per row: a row that lacks either cell must not pick up the
        # values left over from the previously processed row.
        version_number = None
        version_date = None

        # if it is <td headers="VersionNumber">xx</td> then it contains what we need.
        for td in row.findChildren("td"):
            headers = td.attrs.get("headers")
            if not headers:
                continue
            if headers[0] == "VersionNumber":
                version_number = int(td.text)
            elif headers[0] == "VersionDate":
                # strptime does not tolerate surrounding whitespace/newlines.
                version_date = td.text.strip()

        # Skip rows (headers, spacers, ...) that do not describe a version.
        if version_number is None or version_date is None:
            continue

        version_date_dict[version_number] = datetime.strptime(version_date, "%B %d, %Y")

    return version_date_dict
def optional_strip(possible_string):
if type(possible_string) == str:
return possible_string.strip()
@ -370,10 +396,14 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
soup = BeautifulSoup(html,"lxml")
print(getting_data_from_versions)
version_date_dict = extract_submission_dates(soup)
print(version_date_dict)
#preallocate version data
version_a = VersionData(nct_id, version_a_int)
version_b = VersionData(nct_id, version_b_int)
version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int])
version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int])
#extract data from html and put it in the preallocated objects
get_forms(soup, version_a, version_b)
@ -394,6 +424,7 @@ def run():
curse.execute(sql)
for response in tqdm(curse.fetchall()):
nct_id, version_a, version_b, html = response
print(nct_id)
print(nct_id, version_a, version_b) if VERBOSE else ""

Loading…
Cancel
Save