@ -123,14 +123,17 @@ def extract_study_statuses(study_status_form, version_a,version_b):
StudyStatusData objects ,
StudyStatusData objects ,
"""
"""
#get rows
#get rows
rows = study_status_form . table . tbody . find_all ( " tr " )
rows = study_status_form . table . tbody . find_all ( " tr " )
#iterate through rows,
#iterate through rows,
for trow in rows :
for trow in rows :
#matching on rowLabels
#matching on rowLabels
match tr_to_td ( trow ) :
match tr_to_td ( trow ) :
case [ " Primary Completion: " as row_label , tag ] :
case [ " Primary Completion: " as row_label , old , new ] :
old , new = split_by_version ( tag )
print ( " row matched: {} " . format ( row_label ) ) if VERBOSE else " "
tagdate1 = extract_date_and_tag ( old . text )
tagdate1 = extract_date_and_tag ( old . text )
version_a . _primary_completion_date = tagdate1 . date
version_a . _primary_completion_date = tagdate1 . date
version_a . _primary_completion_date_category = optional_strip ( tagdate1 . tag )
version_a . _primary_completion_date_category = optional_strip ( tagdate1 . tag )
@ -139,8 +142,8 @@ def extract_study_statuses(study_status_form, version_a,version_b):
version_b . _primary_completion_date = tagdate2 . date
version_b . _primary_completion_date = tagdate2 . date
version_b . _primary_completion_date_category = optional_strip ( tagdate2 . tag )
version_b . _primary_completion_date_category = optional_strip ( tagdate2 . tag )
case [ " Study Start: " as row_label , tag ] :
case [ " Study Start: " as row_label , old, new ] :
old , new = split_by_version ( tag )
print ( " row matched: {} " . format ( row_label ) ) if VERBOSE else " "
tagdate1 = extract_date_and_tag ( old . text )
tagdate1 = extract_date_and_tag ( old . text )
version_a . _start_date = tagdate1 . date
version_a . _start_date = tagdate1 . date
version_a . _start_date_category = optional_strip ( tagdate1 . tag )
version_a . _start_date_category = optional_strip ( tagdate1 . tag )
@ -149,21 +152,23 @@ def extract_study_statuses(study_status_form, version_a,version_b):
version_b . _start_date = tagdate2 . date
version_b . _start_date = tagdate2 . date
version_b . _start_date_category = optional_strip ( tagdate2 . tag )
version_b . _start_date_category = optional_strip ( tagdate2 . tag )
case [ " Study Completion: " as row_label , tag ] :
case [ " Study Completion: " as row_label , old, new ] :
old , new = split_by_version ( tag )
print ( " row matched: {} " . format ( row_label ) ) if VERBOSE else " "
tagdate1 = extract_date_and_tag ( old . text )
tagdate1 = extract_date_and_tag ( old . text )
version_a . _completion_date = tagdate1 . date
version_a . _completion_date = tagdate1 . date
version_a . _completion_date_category = optional_strip ( tagdate1 . tag )
version_a . _completion_date_category = optional_strip ( tagdate1 . tag )
tagdate2 = extract_date_and_tag ( new . text )
tagdate2 = extract_date_and_tag ( new . text )
version_b . _completion_date = tagdate2 . date
version_b . _completion_date = tagdate2 . date
version_b . _completion_date_category = optional_strip ( tagdate2 . tag )
version_b . _completion_date_category = optional_strip ( tagdate2 . tag )
case [ " Overall Status: " as row_label , tag ] :
case [ " Overall Status: " as row_label , old, new ] :
old , new = split_by_version ( tag )
print ( " row matched: {} " . format ( row_label ) ) if VERBOSE else " "
version_a . _overall_status = optional_strip ( old . text )
version_a . _overall_status = optional_strip ( old . text )
version_b . _overall_status = optional_strip ( new . text )
version_b . _overall_status = optional_strip ( new . text )
#FIX: There is an issue with NCT00789633 where the overall status includes information as to why it was suspended.
case _ as row_label :
print ( " row not matched: {} " . format ( row_label ) ) if VERBOSE else " "
def extract_study_design ( study_status_form , version_a , version_b ) :
def extract_study_design ( study_status_form , version_a , version_b ) :
"""
"""
@ -177,8 +182,8 @@ def extract_study_design(study_status_form, version_a,version_b):
for trow in rows :
for trow in rows :
#matching on rowLabels
#matching on rowLabels
match tr_to_td ( trow ) :
match tr_to_td ( trow ) :
case [ " Enrollment: " as row_label , tag ] :
case [ " Enrollment: " as row_label , old, new ] :
old , new = split_by_version ( tag )
print ( " row matched: {} " . format ( row_label ) ) if VERBOSE else " "
#Extract tag and text, add them to preallocated object
#Extract tag and text, add them to preallocated object
tagtext1 = extract_text_and_tag ( old . text )
tagtext1 = extract_text_and_tag ( old . text )
@ -189,6 +194,8 @@ def extract_study_design(study_status_form, version_a,version_b):
version_b . _enrollment = tagtext2 . text
version_b . _enrollment = tagtext2 . text
version_b . _enrollment_category = optional_strip ( tagtext2 . tag )
version_b . _enrollment_category = optional_strip ( tagtext2 . tag )
case _ as row_label :
print ( " row not matched: {} " . format ( row_label ) ) if VERBOSE else " "
def extract_sponsor_data ( study_status_form , version_a , version_b ) :
def extract_sponsor_data ( study_status_form , version_a , version_b ) :
"""
"""
@ -202,25 +209,30 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
for trow in rows :
for trow in rows :
#matching on rowLabels
#matching on rowLabels
match tr_to_td ( trow ) :
match tr_to_td ( trow ) :
case [ " Sponsor: " as row_label , tag] :
case [ " Sponsor: " as row_label , old, new ] :
old , new = split_by_version ( tag )
print ( " row matched: {} " . format ( row_label ) ) if VERBOSE else " "
version_a . _sponsor = optional_strip ( old . text )
version_a . _sponsor = optional_strip ( old . text )
version_b . _sponsor = optional_strip ( new . text )
version_b . _sponsor = optional_strip ( new . text )
case [ " Responsible Party: " as row_label , tag ] :
case [ " Responsible Party: " as row_label , old, new ] :
old , new = split_by_version ( tag )
print ( " row matched: {} " . format ( row_label ) ) if VERBOSE else " "
version_a . _responsible_party = optional_strip ( old . text )
version_a . _responsible_party = optional_strip ( old . text )
version_b . _responsible_party = optional_strip ( new . text )
version_b . _responsible_party = optional_strip ( new . text )
case [ " Collaborators: " as row_label , tag ] :
case [ " Collaborators: " as row_label , old, new ] :
#old, new = split_by_version(tag)
print ( " row matched: {} " . format ( row_label ) ) if VERBOSE else " "
#TODO: find a trial with multiple collaborators and figure out how to identify/count them.
#TODO: find a trial with multiple collaborators and figure out how to identify/count them.
# So far can't figure out where this is in AACT, so I'm going to ignore it.
# So far can't figure out where this is in AACT, so I'm going to ignore it.
pass
pass
case _ as row_label :
print ( " row not matched: {} " . format ( row_label ) ) if VERBOSE else " "
def split_by_version ( tag ) :
def split_by_version ( tag ) :
'''
OUTDATED: With the new format that separates old and new versions, I don't technically need this. It is a nice place to identify exact changes if those are ever needed, though, and it removes the highlights cleanly.
'''
#clone elements and remove sub-tags that are not needed.
#clone elements and remove sub-tags that are not needed.
old = copy ( tag )
old = copy ( tag )
for span in old . find_all ( class_ = " add_hilite " ) :
for span in old . find_all ( class_ = " add_hilite " ) :
@ -269,7 +281,7 @@ def extract_text_and_tag(text):
#handle various empty cases
#handle various empty cases
if not text or text == ' ' :
if not text or text == ' ' :
return Tag Date Pair( None , None )
return Tag Text Pair( None , None )
date_split = text . split ( " [ " )
date_split = text . split ( " [ " )
if len ( date_split ) > 1 :
if len ( date_split ) > 1 :
@ -282,7 +294,7 @@ def extract_text_and_tag(text):
### FUNCTIONS
### FUNCTIONS
def tr_to_td ( tr ) - > tuple [ str , str ]:
def tr_to_td ( tr ) - > tuple [ str , str , str ]:
"""
"""
Takes an html data row of interest , extracts the record_name from the first < td > , and the data from the second < td > .
Takes an html data row of interest , extracts the record_name from the first < td > , and the data from the second < td > .
@ -291,10 +303,10 @@ def tr_to_td(tr) -> tuple[str, str]:
"""
"""
#get list of cells
#get list of cells
td_list = tr . find_all ( " td " )
td_list = tr . find_all ( " td " )
if len ( td_list ) == 2 :
if len ( td_list ) == 3 :
return td_list [ 0 ] . text , td_list [ 1 ]
return td_list [ 0 ] . text , td_list [ 1 ] , td_list [ 2 ]
else :
else :
return None , None
return None , None , None
def get_forms ( soup , version_a , version_b ) :
def get_forms ( soup , version_a , version_b ) :
@ -343,8 +355,8 @@ def get_forms(soup,version_a,version_b):
pass
pass
case " form_MoreInformation " :
case " form_MoreInformation " :
pass
pass
case _ :
case _ as form_name :
print ( form . attrs [ " id " ] )
print ( " form not matched: {} " . format ( form_name ) ) if VERBOSE else " "
### CONSTANTS
### CONSTANTS
@ -365,6 +377,7 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
if __name__ == " __main__ " :
if __name__ == " __main__ " :
VERBOSE = True
with psycopg2 . connect ( dbname = " aact_db " , user = " root " , password = " root " , host = " will-office " ) as db_connection :
with psycopg2 . connect ( dbname = " aact_db " , user = " root " , password = " root " , host = " will-office " ) as db_connection :
#pull the requests from the db
#pull the requests from the db
@ -376,13 +389,11 @@ if __name__ == "__main__":
"""
"""
curse . execute ( sql )
curse . execute ( sql )
for response in curse . fetchall ( ) :
for response in curse . fetchall ( ) :
#
nct_id , version_a , version_b , html = response
nct_id , version_a , version_b , html = response
print ( nct_id )
print ( nct_id , version_a , version_b ) if VERBOSE else " "
version1 , version2 = get_data_from_versions ( nct_id , html , version_a , version_b )
version1 , version2 = get_data_from_versions ( nct_id , html , version_a , version_b )
print ( version1 . nct_id )
print ( version2 . _enrollment )
if version_b == version_a + 1 :
if version_b == version_a + 1 :
version1 . load_to_db ( db_connection )
version1 . load_to_db ( db_connection )
@ -390,7 +401,6 @@ if __name__ == "__main__":
else :
else :
version2 . load_to_db ( db_connection )
version2 . load_to_db ( db_connection )
exit ( 1 )
"""
"""