@ -1,9 +1,8 @@
from collections import namedtuple
from collections import namedtuple
from copy import copy
from copy import copy
from datetime import datetime
from datetime import datetime
from ensurepip import version
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import abc
import textprocessing as tp #cuz tp is important
import textprocessing as tp #cuz tp is important
#requires Python 3.10
#requires Python 3.10
@ -50,6 +49,9 @@ class VersionData():
#self._responsible_party_category = None #I don't believe this is included in the raw data
#self._responsible_party_category = None #I don't believe this is included in the raw data
#self._collaborators = None #currently going to ignore as I've not found it in AACT
#self._collaborators = None #currently going to ignore as I've not found it in AACT
def load_to_db(db_cursor):
    """
    Placeholder for the database load step; currently does nothing.

    Planned flow: load to the initial table first, then load any extra
    details into other tables. `db_cursor` is accepted but not used yet.

    NOTE(review): the surrounding diff context is `class VersionData`; if
    this is meant to be a method of that class it will need a `self`
    parameter -- confirm against the full file.
    """
    return None
def extract_study_statuses ( study_status_form , version_a , version_b ) :
def extract_study_statuses ( study_status_form , version_a , version_b ) :
"""
"""
@ -66,21 +68,21 @@ def extract_study_statuses(study_status_form, version_a,version_b):
match tr_to_td ( trow ) :
match tr_to_td ( trow ) :
case [ " Primary Completion: " as row_label , tag ] :
case [ " Primary Completion: " as row_label , tag ] :
old , new = split_by_version ( tag )
old , new = split_by_version ( tag )
tagdate1 = extract_date_and_tag ( old . text ,date_MMMM_YYYY )
tagdate1 = extract_date_and_tag ( old . text )
version_a . _primary_completion_date = tagdate1 . date
version_a . _primary_completion_date = tagdate1 . date
version_a . _primary_completion_date_category = tagdate1 . tag
version_a . _primary_completion_date_category = tagdate1 . tag
tagdate2 = extract_date_and_tag ( new . text ,date_MMMM_YYYY )
tagdate2 = extract_date_and_tag ( new . text )
version_b . _primary_completion_date = tagdate2 . date
version_b . _primary_completion_date = tagdate2 . date
version_b . _primary_completion_date_category = tagdate2 . tag
version_b . _primary_completion_date_category = tagdate2 . tag
case [ " Study Completion: " as row_label , tag ] :
case [ " Study Completion: " as row_label , tag ] :
old , new = split_by_version ( tag )
old , new = split_by_version ( tag )
tagdate1 = extract_date_and_tag ( old . text ,date_MMMM_YYYY )
tagdate1 = extract_date_and_tag ( old . text )
version_a . _completion_date = tagdate1 . date
version_a . _completion_date = tagdate1 . date
version_a . _completion_date_category = tagdate1 . tag
version_a . _completion_date_category = tagdate1 . tag
tagdate2 = extract_date_and_tag ( new . text ,date_MMMM_YYYY )
tagdate2 = extract_date_and_tag ( new . text )
version_b . _completion_date = tagdate2 . date
version_b . _completion_date = tagdate2 . date
version_b . _completion_date_category = tagdate2 . tag
version_b . _completion_date_category = tagdate2 . tag
@ -101,7 +103,6 @@ def extract_study_design(study_status_form, version_a,version_b):
#iterate through rows,
#iterate through rows,
for trow in rows :
for trow in rows :
#matching on rowLabels
#matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td ( trow ) :
match tr_to_td ( trow ) :
case [ " Enrollment: " as row_label , tag ] :
case [ " Enrollment: " as row_label , tag ] :
old , new = split_by_version ( tag )
old , new = split_by_version ( tag )
@ -125,7 +126,6 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
#iterate through rows,
#iterate through rows,
for trow in rows :
for trow in rows :
#matching on rowLabels
#matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td ( trow ) :
match tr_to_td ( trow ) :
case [ " Sponsor: " as row_label , tag ] :
case [ " Sponsor: " as row_label , tag ] :
old , new = split_by_version ( tag )
old , new = split_by_version ( tag )
@ -157,16 +157,12 @@ def split_by_version(tag):
return old , new
return old , new
def extract_date_and_tag ( text , date_format ):
def extract_date_and_tag ( text ):
"""
"""
Extracts a datetype according to the date format
Extracts a datetype according to the date format
and the estimate tag based on
and the estimate tag based on
"""
"""
#FIX: Currently, there are multiple (mixed) date formats in use.
#These can exist in the same data field, in two different versions,
#so instead of using a single (passed) date format, I need to
#select between various date formats.
text = text . strip ( )
text = text . strip ( )
@ -179,8 +175,12 @@ def extract_date_and_tag(text, date_format):
estimate_tag = date_split [ 1 ] . split ( " ] " ) [ 0 ] . strip ( )
estimate_tag = date_split [ 1 ] . split ( " ] " ) [ 0 ] . strip ( )
else :
else :
estimate_tag = None
estimate_tag = None
date_object = datetime . strptime ( date_split [ 0 ] . strip ( ) , date_format )
try :
date_object = datetime . strptime ( date_split [ 0 ] . strip ( ) , date_MMMM_YYYY )
except ValueError as ve :
date_object = datetime . strptime ( date_split [ 0 ] . strip ( ) , date_MMMM_DD_YYYY )
return TagDatePair ( estimate_tag , date_object )
return TagDatePair ( estimate_tag , date_object )
@ -274,14 +274,19 @@ def get_forms(soup,version_a,version_b):
date_MMMM_YYYY = " % B % Y "
date_MMMM_YYYY = " % B % Y "
date_MMMM_DD_YYYY = " % B %d , % Y "
date_MMMM_DD_YYYY = " % B %d , % Y "
def get_data_from_versions(nct_id, html, version_a_int, version_b_int):
    """
    Parse an HTML document and populate a pair of VersionData objects.

    nct_id        -- identifier passed to both VersionData constructors
    html          -- markup handed to BeautifulSoup for parsing
    version_a_int -- version number given to the first VersionData
    version_b_int -- version number given to the second VersionData

    Returns the tuple (version_a, version_b) after get_forms() has been
    called with the parsed soup and both objects.
    """
    #the parser name has to be exactly "lxml"; a name padded with spaces
    #does not match any registered tree builder
    soup = BeautifulSoup(html, "lxml")
    version_a = VersionData(nct_id, version_a_int)
    version_b = VersionData(nct_id, version_b_int)
    #presumably get_forms fills in both objects in place, since its
    #return value is not used here -- verify against get_forms
    get_forms(soup, version_a, version_b)
    return version_a, version_b
if __name__ == " __main__ " :
if __name__ == " __main__ " :
for file in [ " ./NCT00658567.html " , " ./NCT01303796.html " ] :
for file in [ " ./NCT00658567.html " , " ./NCT01303796.html " ] :
with open ( file ) as fh :
with open ( file ) as fh :
soup = BeautifulSoup ( fh , " lxml " )
version1 , version2 = get_data_from_versions ( file , fh . read ( ) , 1 , 2 )
version1 = VersionData ( " NCT00658567 " , 1 )
version2 = VersionData ( " NCT00658567 " , 2 )
get_forms ( soup , version1 , version2 )
print ( version1 . __dict__ ) #order messed up somewhere:w
print ( version1 . __dict__ ) #order messed up somewhere:w
print ( version2 . __dict__ ) #order messed up somewhere:w
print ( version2 . __dict__ ) #order messed up somewhere:w