Merging work from other computer into home.

Merge branch 'main' of ssh://gitea.kgjk.icu:3022/Research/ClinicalTrialsDataProcessing
3 years ago · c5f3bfcdec
parent 12c3c69304 29644a0ad5
commit c5f3bfcdec
25 changed files with 474347 additions and 14 deletions
--- a/containers/AACT_downloader/docker-entrypoint-initdb.d/999_ntfy.sh
+++ b/containers/AACT_downloader/docker-entrypoint-initdb.d/999_ntfy.sh
@ -1,2 +0,0 @@
-#!/bin/bash
-wget --post-data="postgres complete:$(date)" -qO- https://ntfy.sh/$NTFY > /dev/null
--- a/containers/RxNav-In-a-box/rxnav-in-a-box-20230103/mysql/999_ntfy.sh
+++ b/containers/RxNav-In-a-box/rxnav-in-a-box-20230103/mysql/999_ntfy.sh
@ -1,6 +0,0 @@
-#!/bin/bash
-#install wget
-apt update
-apt install -y wget
-#send notification
-wget --post-data="mariadb complete:$(date)" -qO- https://ntfy.sh/$NTFY > /dev/null
--- a/containers/docker-compose.yaml
+++ b/containers/docker-compose.yaml
@ -26,7 +26,7 @@ services:
      - ./AACT_downloader/aact_downloads/postgres_data.dmp:/mnt/host_data/postgres_data.dmp
      # this is the folder containing entrypoint info.
      - ./AACT_downloader/docker-entrypoint-initdb.d/:/docker-entrypoint-initdb.d/
-    shm-size: 1g
+    shm_size: 512mb


  rxnav-db:
--- a/development_sql/ASSOICATING
+++ b/development_sql/ASSOICATING
@ -34,6 +34,53 @@ WHERE
 group by bi.nct_id, bi.downcase_mesh_term , rr.tty2 ,rr.rxcui2 
 order by bi.nct_id 
 ;
+--running out of space.
+
+-- get list of interventions associated with trials of interest
+-- (only 'mesh-list' rows whose nct_id appears in history.trial_snapshots)
+create temp table tmp_interventions as
+select * from ctgov.browse_interventions bi 
+where 
+	bi.mesh_type ='mesh-list'
+	and
+	bi.nct_id in (select distinct nct_id from history.trial_snapshots)
+;
+select * from tmp_interventions;
+
+--drop table tmp_join_interv_rxcui;
+-- match each intervention's lower-cased mesh term against the value of
+-- 'RxNorm Name' properties
+create temp table tmp_join_interv_rxcui as
+select *
+from
+	tmp_interventions tint
+	inner join
+	rxnorm_migrated.rxnorm_props rp 
+		on tint.downcase_mesh_term = rp.propvalue1 
+where propname='RxNorm Name'
+;-- get the rxcui for ingredients
+
+select * from tmp_join_interv_rxcui;
+
+--filter rxcui -> is human prescribable
+-- NOTE(review): this view currently returns only a row count of the
+-- PRESCRIBABLE = 'Y' properties; it does not yet expose or filter rxcuis.
+create temp view tmp_view_prescribable as
+select count(*) from rxnorm_migrated.rxnorm_props rp 
+where 
+	rp.propname = 'PRESCRIBABLE' 
+	and 
+	rp.propvalue1 = 'Y'
+;
+
+--link prescribable to brand ingredients or brand names.
+	
+
+--get relationships of IN -> BN
+-- (rows whose first concept is an IN or MIN matched above and whose second
+-- concept has tty2 = 'BN')
+select *
+from 
+	rxnorm_migrated.rxnorm_relations rr 
+where 
+	rr.tty1 in ('IN','MIN')
+	and rr.rxcui1 in (select distinct rxcui from tmp_join_interv_rxcui tjir)
+	and rr.tty2 = 'BN'
+;
+


 --match trials to through brands NDC11
--- a/version)/COD_cause2code.psv
+++ b/version)/COD_cause2code.psv
--- a/version)/GlobalBurdenDisease/IHME_GBD_2019_COD_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX
+++ b/version)/GlobalBurdenDisease/IHME_GBD_2019_COD_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX
--- a/version)/GlobalBurdenDisease/IHME_GBD_2019_NONFATAL_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX
+++ b/version)/GlobalBurdenDisease/IHME_GBD_2019_NONFATAL_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX
--- a/version)/ICD10-to-GDB_expander.py
+++ b/version)/ICD10-to-GDB_expander.py
@ -0,0 +1,103 @@
+import pandas as pd
+import numpy as np
+import itertools
+
+
+
+IHME_COD_FILEPATH = "./GlobalBurdenDisease/IHME_GBD_2019_COD_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX"
+IHME_NONFATAL_FILEPATH = "./GlobalBurdenDisease/IHME_GBD_2019_NONFATAL_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX"
+ICD10CM_ORDER_FILEPATH = "./icd10_combined-who-cms.psv"
+
+def justify(string):
+    '''
+    The purpose of this is to transform codes such as A00 and A000
+    into a normalized, sortable format e.g. 'A00----' and 'A000---'
+    '''
+    return string.ljust(7,"-")
+
+
+class CodeRange():
+    def __init__(self,cause,code_book,codes):
+        self.cause = cause
+        self.code_book = code_book
+        self.code_list = []
+
+        codes = "" if type(codes) is float else codes #normalize codes to string...
+        codes = [x.strip().replace('.','').split('-') for x in codes.split(",")]
+        
+        for rng in codes:
+
+            if rng[0] is None:
+                raise Exception("Listed ICD10 Code (Begin:{}) is not in codebook".format(rng), rng)
+
+            #lookup codes
+            if len(rng) == 1:
+                begin = justify(rng[0])
+                if self.code_book.get(begin) is None:
+                    continue
+                else:
+                    self.code_list.append(begin)
+            else:
+                begin = justify(rng[0])
+                end = justify(rng[1])
+                begin_bitmask = [x >= begin for x in list(self.code_book)]
+                end_bitmask = [x <= end for x in list(self.code_book)]
+
+                bitmask = [x and y for x,y in zip(begin_bitmask,end_bitmask)]
+                
+                self.code_list.extend(list(itertools.compress(list(self.code_book),bitmask)))
+
+
+
+    def __str__(self):
+        txt = ''
+        for item in self.code_list:
+            txt += "{} | {}\n".format(item, self.cause)
+
+        return txt
+
+#READ in ICD10CM codes
+
+icd10_codes = {}
+
+
+with open(ICD10CM_ORDER_FILEPATH,"r") as icd_fh:
+    for idx,line in enumerate(icd_fh.readlines()):
+        #read info
+        code, descr, source = line.split("|")
+        #cleanup info
+        code = justify(code.strip())
+        descr = descr.strip()
+        source = source.strip()
+
+        #Store in code dict
+        icd10_codes[code] = (idx,descr, source)
+
+
+
+cod = pd.read_excel(IHME_COD_FILEPATH,header=1)
+
+with open("COD_cause2code.psv", "w") as outfh:
+    itt = 0
+    for row in cod.itertuples():
+        cause = row[1]
+        codes = row[2]
+        
+        c = CodeRange(cause,icd10_codes,codes)
+
+        outfh.write(c.__str__())
+
+
+
+
+nonfatal = pd.read_excel(IHME_NONFATAL_FILEPATH,header=1)
+with open("NONFATAL_cause2code.psv", "w") as outfh:
+    itt = 0
+    for row in nonfatal.itertuples():
+        cause = row[2]
+        codes= row[3]
+        c = CodeRange(cause,icd10_codes,codes)
+
+        outfh.write(c.__str__())
+
+
--- a/version)/NONFATAL_cause2code.psv
+++ b/version)/NONFATAL_cause2code.psv
--- a/non-db_data_sources/GBD
+++ b/non-db_data_sources/GBD
@ -0,0 +1,24 @@
+This data was obtained by opening each nested portion on the
+left navigation bar at 
+    https://icd.who.int/browse10/2019/en
+and then copying and pasting the data into a text file (icd10-2019.txt). 
+
+
+This text file was then adjusted to get the pipe-separated values version 
+which 
+has the following columns
+- code: the icd-10 code in a normalized format
+- description: The basic description given
+- source: This just says WHO so that it is possible to merge it with other 
+sources.
+
+
+The adjustments were as follows (parentheses include vim search and replace 
+commands used):
+- delete tabs (:%s/\t//g)
+- delete leading spaces (:%s/^\s//)
+- remove excess newlines (:%s/^\n//)
+- remove periods in codes (:%s/\.//)
+- Convert to Pipe-separated values file (:%s/\s/ | /)
+- add column of sources (:%s/\s*$/ | WHO)
+- Type in column headers
--- a/non-db_data_sources/GBD
+++ b/non-db_data_sources/GBD
--- a/(2019)/icd10-2019_categories_only.psv
+++ b/(2019)/icd10-2019_categories_only.psv
--- a/(2019)/icd10-2019_categories_only.txt
+++ b/(2019)/icd10-2019_categories_only.txt
--- a/version)/icd10_combined-who-cms.psv
+++ b/version)/icd10_combined-who-cms.psv
--- a/version)/icd10cm_codes_addenda_2019.zip
+++ b/version)/icd10cm_codes_addenda_2019.zip
--- a/version)/icd10cm_codes_addenda_2019/README.txt
+++ b/version)/icd10cm_codes_addenda_2019/README.txt
@ -0,0 +1,6 @@
+This contains the CMS version of ICD-10-CM codes.
+I have included a version I converted to pipe-separated values with the 
+following columns
+- code : the ICD-10-CM code.
+- description: a basic description
+- source: Says CMS-cm so that it can be combined with other sources.
--- a/version)/icd10cm_codes_addenda_2019/icd10cmCodesFile.pdf
+++ b/version)/icd10cm_codes_addenda_2019/icd10cmCodesFile.pdf
--- a/version)/icd10cm_codes_addenda_2019/icd10cm_codes_2019.psv
+++ b/version)/icd10cm_codes_addenda_2019/icd10cm_codes_2019.psv
--- a/version)/icd10cm_codes_addenda_2019/icd10cm_codes_2019.txt
+++ b/version)/icd10cm_codes_addenda_2019/icd10cm_codes_2019.txt
--- a/version)/merge_icd-versions.sh
+++ b/version)/merge_icd-versions.sh
@ -0,0 +1,10 @@
+#!/bin/bash
+
+#Merge the WHO and CMS pipe-separated code lists into
+#icd10_combined-who-cms.psv, keeping one row per code.
+#NOTE(review): all paths are relative, so this presumably has to be run
+#from the directory holding both source folders -- confirm.
+icd10_who="./WHO ICD-10 (2019)/icd10-2019_categories_only.psv"
+icd10cm_cms="./icd10cm_codes_addenda_2019/icd10cm_codes_2019.psv"
+
+#concatenate the two files
+#then lexically sort them by first column and then third column (reversed)
+#then sort/unique based on first column, so only one row per code remains
+#then save to file
+cat "$icd10_who" "$icd10cm_cms" | sort -t "|" -k 1,1 -k 3,3r | sort -u -t "|" -k 1,1 > icd10_combined-who-cms.psv
--- a/scripts/drugtools/env_setup.py
+++ b/scripts/drugtools/env_setup.py
@ -1,5 +1,6 @@
 import pymysql
 import psycopg2 as psyco
+from psycopg2.sql import SQL, Identifier
 from dotenv import dotenv_values

 env_path = "../containers/.env"
@ -28,3 +29,15 @@ def postgres_conn(**kwargs):

 def get_tables_of_interest():
    return ENV["TABLES_OF_INTEREST"].split(",")
+
+def postgres_table_delete_entries(schema,table):
+    with postgres_conn() as con:
+        with con.cursor() as curse:
+            delete_statement = SQL("delete from {schema}.{table}").format(
+                schema=Identifier(schema),
+                talbe=Identifier(table)
+                )
+            curse.execute(delete_statement)
+            con.commit()
+
+
--- a/scripts/drugtools/historical_trial_selector.py
+++ b/scripts/drugtools/historical_trial_selector.py
@ -1,4 +1,4 @@
-from drugtools.env_setup import postgres_conn
+from .env_setup import postgres_conn
 from pathlib import Path


--- a/scripts/drugtools/migrate_mysql2pgsql.py
+++ b/scripts/drugtools/migrate_mysql2pgsql.py
@ -4,7 +4,7 @@ from psycopg2 import extras
 import pymysql
 from dotenv import load_dotenv
 import os
-from drugtools.env_setup import postgres_conn, mariadb_conn, get_tables_of_interest
+from .env_setup import postgres_conn, mariadb_conn, get_tables_of_interest


 ##############NOTE
--- a/scripts/rm_data.sh
+++ b/scripts/rm_data.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+
+rm -r ../containers/RxNav-In-a-box/rxnav_data/*
+
+rm -r ../containers/AACT_downloader/postgresql/data
--- a/scripts/runall.py
+++ b/scripts/runall.py
@ -0,0 +1,20 @@
+from drugtools import env_setup
+from drugtools import historical_trial_selector as hts
+from drugtools import historical_nct_downloader as hnd
+from drugtools import historical_nct_extractor as hne
+from drugtools import download_and_extract_nsde as daen
+from drugtools import migrate_mysql2pgsql as mm2p
+
+print("Current Environment")
+print(env_setup.ENV)
+
+cont = input("Are you willing to continue with the current environmnet? y/[n]")
+
+if cont == "Y" or cont == "y":
+    hts.run()
+    hnd.run()
+    hne.run()
+    daen.run()
+    mm2p.run()
+else:
+    print("Please fix your .env file and try again")