Merge branch 'main' of ssh://gitea.kgjk.icu:3022/Research/ClinicalTrialsDataProcessing into main

llm-extraction
youainti 3 years ago
commit b7290c271b

@ -0,0 +1,64 @@
/* How many trials were included?
* How many trials were inspected?
* How many trials were reserved for download?
* How many trials didn't get included for some technical reason?
*
********* Data from 2023-03-29 ***********
Of Interest 1981
Reserved 1709 #I believe this is lower than the downloaded number because I reserved them earlier
Downloaded 1960
Incomplete 3 #there were a few http 500 and 404 codes
******************************************
* Note there were 21 missing trials of interest.
* */
-- Number of distinct trials recorded under each download status.
select
    ds.status,
    count(distinct ds.nct_id)
from http.download_status as ds
group by ds.status
;
/* Count the trials that have at least one extracted snapshot
* -- There are currently 304 trials for which I was able to extract unique snapshots (2023-03-29)
* */
select count(distinct ts.nct_id)
from history.trial_snapshots as ts
; -- terminator was missing, which merged this statement with the next one
/* Number of listed conditions for trials that have snapshots
* -- There are only 609 listed (MeSH classified) conditions from 284 trials
* I may need to expand how I address conditions
*/
select count(*)
from ctgov.browse_conditions as bc
where bc.mesh_type = 'mesh-list'
  and exists (
      select 1
      from history.trial_snapshots as ts
      where ts.nct_id = bc.nct_id
  )
;
-- Number of distinct snapshotted trials that have a 'mesh-list' browse condition.
select count(distinct bc.nct_id)
from ctgov.browse_conditions as bc
where bc.mesh_type = 'mesh-list'
  and exists (
      select 1
      from history.trial_snapshots as ts
      where ts.nct_id = bc.nct_id
  )
;
/*
* If I were to expand that to non-coded conditions that would be
* 304 trials with 398 conditions
* (this query gives the trial count)
* */
select count(distinct c.nct_id)
from ctgov.conditions as c
where exists (
    select 1
    from history.trial_snapshots as ts
    where ts.nct_id = c.nct_id
)
;
-- Number of condition rows (coded or not) belonging to snapshotted trials.
select count(*)
from ctgov.conditions as c
where c.nct_id in (select ts.nct_id from history.trial_snapshots as ts)
; -- terminator was missing, which merged this statement with the next one
/* Number of proposed trial -> ICD-10 matches from UMLS
* There are about 5,808 proposed matches.
* (1383 before run at 8pm 2023-03-29)
*/
select count(*)
from "DiseaseBurden".trial_to_icd10 as tti
;

@ -0,0 +1,32 @@
-- Inspect every row of the icd10_to_cause table.
select itc.*
from "DiseaseBurden".icd10_to_cause as itc
;
-- Inspect every row of the cause table.
select c.*
from "DiseaseBurden".cause as c
;
-- Number of distinct codes in icd10_to_cause attached to each cause,
-- matched on the cause's text. (A stray leading `select` was removed;
-- it made the statement unparseable.)
select
    c.id,
    count(distinct itc.code)
from "DiseaseBurden".cause as c
inner join "DiseaseBurden".icd10_to_cause as itc
    on c.cause = itc.cause_text
group by c.id
order by c.id
;
-- Number of distinct trials with at least one accepted match.
select count(distinct tti.nct_id)
from "DiseaseBurden".trial_to_icd10 as tti
where tti.approved = 'accepted'
;
-- Every accepted match: trial, condition text and the matched identifier (ui).
select
    tti.nct_id,
    tti."condition",
    tti.ui
from "DiseaseBurden".trial_to_icd10 as tti
where tti.approved = 'accepted'
;
-- Unique (trial, condition, cause_text) combinations for accepted matches,
-- excluding the four cause texts listed in the filter below.
select distinct
    tti.nct_id,
    tti."condition",
    itc.cause_text
from "DiseaseBurden".trial_to_icd10 as tti
inner join "DiseaseBurden".icd10_to_cause as itc
    -- codes are compared with '-' and '.' removed from both sides
    on replace(replace(tti.ui, '-', ''), '.', '') = replace(replace(itc.code, '-', ''), '.', '')
where tti.approved = 'accepted'
  and itc.cause_text not in (
      'Non-communicable diseases',
      'Neoplasms',
      'Mental disorders',
      'Other non-communicable diseases'
  )
order by tti.nct_id
;

@ -0,0 +1 @@
# Start waitress-serve on port 5000 for Icd10ConditionsMatching:create_app.
# NOTE(review): per the waitress docs, --call makes waitress invoke the named
# callable to obtain the WSGI app -- confirm create_app is such a factory.
waitress-serve --port=5000 --call 'Icd10ConditionsMatching:create_app'

@ -28,10 +28,11 @@ class VersionData():
It will also implement the ability to load the data to the database It will also implement the ability to load the data to the database
""" """
def __init__(self,nct_id,version_id): def __init__(self,nct_id,version_id,submission_date):
#identifiers #identifiers
self.nct_id = nct_id.strip() self.nct_id = nct_id.strip()
self.version_id = version_id self.version_id = version_id
self.submission_date = submission_date
#Study Status #Study Status
self._primary_completion_date = None self._primary_completion_date = None
@ -56,6 +57,7 @@ class VersionData():
( (
nct_id, nct_id,
version, version,
submission_date,
primary_completion_date, primary_completion_date,
primary_completion_date_category, primary_completion_date_category,
start_date, start_date,
@ -82,6 +84,7 @@ class VersionData():
%s, %s,
%s, %s,
%s, %s,
%s,
%s %s
) )
""" """
@ -93,6 +96,7 @@ class VersionData():
( (
self.nct_id, self.nct_id,
self.version_id, self.version_id,
self.submission_date,
self._primary_completion_date, self._primary_completion_date,
self._primary_completion_date_category, self._primary_completion_date_category,
self._start_date, self._start_date,
@ -113,6 +117,28 @@ class VersionData():
db_connection.commit() db_connection.commit()
############ Functions
def extract_submission_dates(soup):
    """
    Extract the date of each version from the first fieldset's table.

    Every <tr> of the table body in the first <fieldset> of `soup` is scanned.
    Within a row, the <td> whose first `headers` value is "VersionNumber"
    supplies the version number and the one whose first `headers` value is
    "VersionDate" supplies the date, written like "March 29, 2023".

    Parameters:
        soup: parsed document exposing findChildren() (BeautifulSoup-style).

    Returns:
        dict mapping int version number -> datetime parsed from the date cell.
        Rows that lack either cell (e.g. header rows) are skipped.

    Raises:
        ValueError if a version number or date cell cannot be parsed.
        Errors from a missing fieldset/table/tbody are not caught here.
    """
    table_rows = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")
    version_date_dict = {}

    # Rows are walked last-to-first (as before), so when a version number
    # appears more than once the earliest row in the table wins.
    for row in reversed(table_rows):
        # Reset per row so a value from another row is never reused and a
        # row without the expected cells cannot hit an unbound variable.
        version_number = None
        version_date = None

        for td in row.findChildren("td"):
            headers = td.attrs.get("headers")
            if not headers:
                continue
            if headers[0] == "VersionNumber":
                version_number = int(td.text.strip())
            elif headers[0] == "VersionDate":
                # strip so padding whitespace does not break strptime
                version_date = td.text.strip()

        if version_number is None or version_date is None:
            continue
        version_date_dict[version_number] = datetime.strptime(version_date, "%B %d, %Y")

    return version_date_dict
def optional_strip(possible_string): def optional_strip(possible_string):
if type(possible_string) == str: if type(possible_string) == str:
return possible_string.strip() return possible_string.strip()
@ -370,10 +396,14 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id,html, version_a_int, version_b_int): def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
soup = BeautifulSoup(html,"lxml") soup = BeautifulSoup(html,"lxml")
# Debug trace. The original passed a bare, undefined name to print(),
# which raises NameError; the intended message is the string below.
print("getting_data_from_versions")
# Map each version number to its date, as read from the parsed page.
version_date_dict = extract_submission_dates(soup)
print(version_date_dict)
#preallocate version data #preallocate version data
version_a = VersionData(nct_id, version_a_int) version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int])
version_b = VersionData(nct_id, version_b_int) version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int])
#extract data from html and put it in the preallocated objects #extract data from html and put it in the preallocated objects
get_forms(soup, version_a, version_b) get_forms(soup, version_a, version_b)
@ -394,6 +424,7 @@ def run():
curse.execute(sql) curse.execute(sql)
for response in tqdm(curse.fetchall()): for response in tqdm(curse.fetchall()):
nct_id, version_a, version_b, html = response nct_id, version_a, version_b, html = response
print(nct_id)
print(nct_id, version_a, version_b) if VERBOSE else "" print(nct_id, version_a, version_b) if VERBOSE else ""

Loading…
Cancel
Save