@ -1,9 +1,8 @@
from collections import namedtuple
from copy import copy
from datetime import datetime
from ensurepip import version
from bs4 import BeautifulSoup
import abc
import textprocessing as tp #cuz tp is important
#requires Python 3.10
@ -50,6 +49,9 @@ class VersionData():
#self._responsible_party_category = None #I don't believe this is included in the raw data
#self._collaborators = None #currently going to ignore as I've not found it in AACT
def load_to_db(db_cursor):
    """Placeholder for persisting version data; currently does nothing.

    Planned flow: load to the initial table first, then load any extra
    details into the other tables.

    NOTE(review): this definition sits next to VersionData's attribute
    setup; if it is meant to be a method of that class it is missing
    ``self`` -- confirm before implementing.

    Args:
        db_cursor: Database cursor to write through (unused for now).

    Returns:
        None.
    """
    return None
def extract_study_statuses ( study_status_form , version_a , version_b ) :
"""
@ -66,21 +68,21 @@ def extract_study_statuses(study_status_form, version_a,version_b):
match tr_to_td ( trow ) :
case [ " Primary Completion: " as row_label , tag ] :
old , new = split_by_version ( tag )
tagdate1 = extract_date_and_tag ( old . text ,date_MMMM_YYYY )
tagdate1 = extract_date_and_tag ( old . text )
version_a . _primary_completion_date = tagdate1 . date
version_a . _primary_completion_date_category = tagdate1 . tag
tagdate2 = extract_date_and_tag ( new . text ,date_MMMM_YYYY )
tagdate2 = extract_date_and_tag ( new . text )
version_b . _primary_completion_date = tagdate2 . date
version_b . _primary_completion_date_category = tagdate2 . tag
case [ " Study Completion: " as row_label , tag ] :
old , new = split_by_version ( tag )
tagdate1 = extract_date_and_tag ( old . text ,date_MMMM_YYYY )
tagdate1 = extract_date_and_tag ( old . text )
version_a . _completion_date = tagdate1 . date
version_a . _completion_date_category = tagdate1 . tag
tagdate2 = extract_date_and_tag ( new . text ,date_MMMM_YYYY )
tagdate2 = extract_date_and_tag ( new . text )
version_b . _completion_date = tagdate2 . date
version_b . _completion_date_category = tagdate2 . tag
@ -101,7 +103,6 @@ def extract_study_design(study_status_form, version_a,version_b):
#iterate through rows,
for trow in rows :
#matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td ( trow ) :
case [ " Enrollment: " as row_label , tag ] :
old , new = split_by_version ( tag )
@ -125,7 +126,6 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
#iterate through rows,
for trow in rows :
#matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td ( trow ) :
case [ " Sponsor: " as row_label , tag ] :
old , new = split_by_version ( tag )
@ -157,16 +157,12 @@ def split_by_version(tag):
return old , new
def extract_date_and_tag ( text , date_format ):
def extract_date_and_tag ( text ):
"""
Extracts a datetype according to the date format
and the estimate tag based on
"""
#FIX: Currently, there are multiple (mixed) date formats in use.
#These can exist in the same data field, in two different versions,
#so instead of using a single (passed) date format, I need to
#select between various date formats.
text = text . strip ( )
@ -179,8 +175,12 @@ def extract_date_and_tag(text, date_format):
estimate_tag = date_split [ 1 ] . split ( " ] " ) [ 0 ] . strip ( )
else :
estimate_tag = None
date_object = datetime . strptime ( date_split [ 0 ] . strip ( ) , date_format )
try :
date_object = datetime . strptime ( date_split [ 0 ] . strip ( ) , date_MMMM_YYYY )
except ValueError as ve :
date_object = datetime . strptime ( date_split [ 0 ] . strip ( ) , date_MMMM_DD_YYYY )
return TagDatePair ( estimate_tag , date_object )
@ -274,14 +274,19 @@ def get_forms(soup,version_a,version_b):
# strptime formats tried by extract_date_and_tag.
# The directives must be written without padding: "% B" is rejected by
# strptime as a bad directive.
# Full month name followed by a four-digit year, e.g. "March 2010".
date_MMMM_YYYY = "%B %Y"
# Full month name, day of month, comma, four-digit year, e.g. "March 5, 2010".
date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id, html, version_a_int, version_b_int):
    """Parse a version-comparison page into two VersionData objects.

    Args:
        nct_id: Trial identifier stored on both VersionData objects.
        html: Markup of the comparison page, handed to BeautifulSoup.
        version_a_int: Version number given to the first VersionData.
        version_b_int: Version number given to the second VersionData.

    Returns:
        The tuple ``(version_a, version_b)`` after both objects have been
        passed through ``get_forms`` together with the parsed page.
    """
    # The parser name must be spelled exactly "lxml": BeautifulSoup looks
    # parsers up by name, and a padded name matches no installed parser.
    soup = BeautifulSoup(html, "lxml")
    version_a = VersionData(nct_id, version_a_int)
    version_b = VersionData(nct_id, version_b_int)
    get_forms(soup, version_a, version_b)
    return version_a, version_b
if __name__ == "__main__":
    # Manual smoke run over two locally saved comparison pages.
    from pathlib import Path

    for file in ["./NCT00658567.html", "./NCT01303796.html"]:
        # Read the page once; a second read of the same handle would
        # return an empty string and yield empty version data.
        with open(file) as fh:
            html = fh.read()

        # Use the file's stem (e.g. "NCT00658567") as the trial id rather
        # than the path itself or an id hard-coded for only one file.
        nct_id = Path(file).stem
        version1, version2 = get_data_from_versions(nct_id, html, 1, 2)

        # TODO: order messed up somewhere
        print(version1.__dict__)
        print(version2.__dict__)