|
|
|
@ -28,10 +28,11 @@ class VersionData():
|
|
|
|
|
|
|
|
|
|
|
|
It will also implement the ability to load the data to the database
|
|
|
|
It will also implement the ability to load the data to the database
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
def __init__(self,nct_id,version_id):
|
|
|
|
def __init__(self,nct_id,version_id,submission_date):
|
|
|
|
#identifiers
|
|
|
|
#identifiers
|
|
|
|
self.nct_id = nct_id.strip()
|
|
|
|
self.nct_id = nct_id.strip()
|
|
|
|
self.version_id = version_id
|
|
|
|
self.version_id = version_id
|
|
|
|
|
|
|
|
self.submission_date = submission_date
|
|
|
|
|
|
|
|
|
|
|
|
#Study Status
|
|
|
|
#Study Status
|
|
|
|
self._primary_completion_date = None
|
|
|
|
self._primary_completion_date = None
|
|
|
|
@ -56,6 +57,7 @@ class VersionData():
|
|
|
|
(
|
|
|
|
(
|
|
|
|
nct_id,
|
|
|
|
nct_id,
|
|
|
|
version,
|
|
|
|
version,
|
|
|
|
|
|
|
|
submission_date,
|
|
|
|
primary_completion_date,
|
|
|
|
primary_completion_date,
|
|
|
|
primary_completion_date_category,
|
|
|
|
primary_completion_date_category,
|
|
|
|
start_date,
|
|
|
|
start_date,
|
|
|
|
@ -82,6 +84,7 @@ class VersionData():
|
|
|
|
%s,
|
|
|
|
%s,
|
|
|
|
%s,
|
|
|
|
%s,
|
|
|
|
%s,
|
|
|
|
%s,
|
|
|
|
|
|
|
|
%s,
|
|
|
|
%s
|
|
|
|
%s
|
|
|
|
)
|
|
|
|
)
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
@ -93,6 +96,7 @@ class VersionData():
|
|
|
|
(
|
|
|
|
(
|
|
|
|
self.nct_id,
|
|
|
|
self.nct_id,
|
|
|
|
self.version_id,
|
|
|
|
self.version_id,
|
|
|
|
|
|
|
|
self.submission_date,
|
|
|
|
self._primary_completion_date,
|
|
|
|
self._primary_completion_date,
|
|
|
|
self._primary_completion_date_category,
|
|
|
|
self._primary_completion_date_category,
|
|
|
|
self._start_date,
|
|
|
|
self._start_date,
|
|
|
|
@ -113,6 +117,28 @@ class VersionData():
|
|
|
|
|
|
|
|
|
|
|
|
db_connection.commit()
|
|
|
|
db_connection.commit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
############ Functions
|
|
|
|
|
|
|
|
def extract_submission_dates(soup):
    """
    Extract the submission date of each version listed in the history table.

    The rows are taken from the <tbody> of the table inside the first
    <fieldset> of `soup`. Within a row, the <td> whose first `headers`
    value is "VersionNumber" supplies the version number and the one whose
    first `headers` value is "VersionDate" supplies the date text, which is
    parsed with the format "%B %d, %Y" (e.g. "January 5, 2020").

    Parameters:
        soup: parsed document exposing the BeautifulSoup-style
            `findChildren` / attribute navigation used below.

    Returns:
        dict mapping version number (int) to submission date (datetime).
        Rows that do not carry both a version number and a version date
        are skipped.

    Raises:
        ValueError: if a version number is not an integer or a date does
            not match the expected format.
        IndexError / AttributeError: if the document has no fieldset, or
            the first fieldset has no table/tbody.
    """
    table_rows = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")

    version_date_dict = {}

    # Rows are walked last-to-first, so when a version number appears more
    # than once the entry from the earliest row in the table is the one kept.
    for row in reversed(table_rows):
        # Reset per row so an incomplete row can neither reuse the previous
        # row's values nor hit an unbound local on the first iteration.
        version_number = None
        version_date = None

        # A cell like <td headers="VersionNumber">xx</td> contains what we need.
        for td in row.findChildren("td"):
            headers = td.attrs.get("headers")
            if not headers:
                # No (or empty) headers attribute: not a cell we care about.
                continue
            if headers[0] == "VersionNumber":
                version_number = int(td.text)
            elif headers[0] == "VersionDate":
                # strptime rejects surrounding whitespace, so trim it here.
                version_date = td.text.strip()

        if version_number is None or version_date is None:
            continue

        version_date_dict[version_number] = datetime.strptime(version_date, "%B %d, %Y")

    return version_date_dict
|
|
|
|
|
|
|
|
|
|
|
|
def optional_strip(possible_string):
|
|
|
|
def optional_strip(possible_string):
|
|
|
|
if type(possible_string) == str:
|
|
|
|
if type(possible_string) == str:
|
|
|
|
return possible_string.strip()
|
|
|
|
return possible_string.strip()
|
|
|
|
@ -370,10 +396,14 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
|
|
|
|
|
|
|
|
|
|
|
|
def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
|
|
|
|
def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
|
|
|
|
soup = BeautifulSoup(html,"lxml")
|
|
|
|
soup = BeautifulSoup(html,"lxml")
|
|
|
|
|
|
|
|
print(getting_data_from_versions)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
version_date_dict = extract_submission_dates(soup)
|
|
|
|
|
|
|
|
print(version_date_dict)
|
|
|
|
|
|
|
|
|
|
|
|
#preallocate version data
|
|
|
|
#preallocate version data
|
|
|
|
version_a = VersionData(nct_id, version_a_int)
|
|
|
|
version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int])
|
|
|
|
version_b = VersionData(nct_id, version_b_int)
|
|
|
|
version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int])
|
|
|
|
|
|
|
|
|
|
|
|
#extract data from html and put it in the preallocated objects
|
|
|
|
#extract data from html and put it in the preallocated objects
|
|
|
|
get_forms(soup, version_a, version_b)
|
|
|
|
get_forms(soup, version_a, version_b)
|
|
|
|
@ -394,6 +424,7 @@ def run():
|
|
|
|
curse.execute(sql)
|
|
|
|
curse.execute(sql)
|
|
|
|
for response in tqdm(curse.fetchall()):
|
|
|
|
for response in tqdm(curse.fetchall()):
|
|
|
|
nct_id, version_a, version_b, html = response
|
|
|
|
nct_id, version_a, version_b, html = response
|
|
|
|
|
|
|
|
print(nct_id)
|
|
|
|
|
|
|
|
|
|
|
|
print(nct_id, version_a, version_b) if VERBOSE else ""
|
|
|
|
print(nct_id, version_a, version_b) if VERBOSE else ""
|
|
|
|
|
|
|
|
|
|
|
|
|