diff --git a/AACT_downloader/ClinicalTrialHistory/docker-entrypoint-initdb.d/020_HttpSchema.sql b/AACT_downloader/ClinicalTrialHistory/docker-entrypoint-initdb.d/020_HttpSchema.sql
index 3b6e204..9a1999b 100644
--- a/AACT_downloader/ClinicalTrialHistory/docker-entrypoint-initdb.d/020_HttpSchema.sql
+++ b/AACT_downloader/ClinicalTrialHistory/docker-entrypoint-initdb.d/020_HttpSchema.sql
@@ -22,20 +22,13 @@ As not every request will have an xml doc, split them.
 
 CREATE TABLE IF NOT EXISTS http.responses (
     id SERIAL PRIMARY KEY,
-    nct VARCHAR(15),
-    version SMALLINT,
+    nct_id VARCHAR(15),
+    version_a SMALLINT,
+    version_b SMALLINT,
     url VARCHAR(255),
     response_code SMALLINT,
-    response_date DATE
+    response_date TIMESTAMP WITH TIME ZONE,
+    html TEXT
 );
 
-CREATE TABLE IF NOT EXISTS http.xml_documents (
-    id SERIAL PRIMARY KEY,
-    xml XML,
-    CONSTRAINT http_response
-        FOREIGN KEY (id)
-            REFERENCES http.responses (id)
-            ON DELETE CASCADE --remove xml if the request is deleted
-);
-
 
diff --git a/downloader/db_connection.py b/downloader/db_connection.py
index a1da827..29651c8 100644
--- a/downloader/db_connection.py
+++ b/downloader/db_connection.py
@@ -4,8 +4,8 @@ conn = psyco.connect(dbname="aact_db", user="root", host="localhost", password="
 
 curse = conn.cursor()
 
-curse.execute("SELECT * FROM ctgov.studies LIMIT 2;")
+curse.execute("select * FROM http.responses")
 print(curse.fetchall())
 
 curse.close()
-conn.close()
\ No newline at end of file
+conn.close()
diff --git a/downloader/downloader.py b/downloader/downloader.py
new file mode 100644
index 0000000..15f72c4
--- /dev/null
+++ b/downloader/downloader.py
@@ -0,0 +1,112 @@
+import requests
+import psycopg2 as psyco
+from datetime import datetime, timezone
+from bs4 import BeautifulSoup
+
+
+def get_highest_version_number(response):
+    """Return the highest version number listed on a trial history page.
+
+    Reads the first cell of the last row of the table inside the first
+    <fieldset> of the returned HTML and converts its text to an int.
+    """
+    soup = BeautifulSoup(response.text, features="lxml")
+    version_rows = soup.find_all("fieldset")[0].table.tbody.find_all("tr")
+    return int(version_rows[-1].td.text)
+
+
+def make_request(nct_id, version1, version2, timeout=30):
+    """Fetch the side-by-side history page comparing two versions of a trial.
+
+    timeout is the number of seconds handed to requests.get so that a
+    stalled connection cannot hang the download indefinitely.
+    """
+    baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
+    url = baseurl.format(nct_id, version1, version2)
+    return requests.get(url, timeout=timeout)
+
+
+def upload_response(db_conn, nct_id, version_a, version_b, response):
+    """Insert one HTTP response into http.responses and return the new id rows.
+
+    db_conn must be an open database cursor: it is used both to run the
+    INSERT and to fetch the rows produced by RETURNING id.
+    """
+    # Read the header from the response that was passed in.  HTTP Date
+    # headers are always expressed in GMT, so tag the parsed value as UTC
+    # before it is stored in the TIMESTAMP WITH TIME ZONE column.
+    timestamp = datetime.strptime(
+        response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z"
+    ).replace(tzinfo=timezone.utc)
+
+    db_conn.execute(
+        """
+        INSERT INTO http.responses
+        (nct_id,version_a,version_b,url,response_code,response_date, html)
+        VALUES (%s,%s,%s,%s,%s,%s,%s)
+        RETURNING id
+        ;
+        """,
+        (
+            nct_id,
+            version_a,
+            version_b,
+            response.url,
+            response.status_code,
+            timestamp.isoformat(),
+            response.text,  # FIX: there is an issue with storing the tags for some reason.
+        ),
+    )
+
+    # The RETURNING rows come from the same cursor that ran the INSERT.
+    return db_conn.fetchall()
+
+
+def download_trial_records(nct_id, db_conn):
+    """Download and store every version comparison page for one trial.
+
+    Versions are fetched in consecutive pairs (1,2), (3,4), ...  When the
+    highest version is odd it has no partner, so it is fetched as (1, v).
+    """
+    # The first pair is always requested; its page is also used to find v.
+    r = make_request(nct_id, 1, 2)
+    upload_response(db_conn, nct_id, 1, 2, r)
+
+    v = get_highest_version_number(r)
+
+    for version_a, version_b in step_generator(v):
+        r = make_request(nct_id, version_a, version_b)
+        upload_response(db_conn, nct_id, version_a, version_b, r)
+
+    if v % 2 == 1:
+        r = make_request(nct_id, 1, v)
+        upload_response(db_conn, nct_id, 1, v, r)
+
+
+def step_generator(max_version):
+    """Yield the version pairs (3,4), (5,6), ... up to max_version.
+
+    The upper bound is inclusive so that an even max_version gets its own
+    final pair, e.g. step_generator(6) yields (3,4) and then (5,6).
+    """
+    for version_b in range(4, max_version + 1, 2):
+        yield (version_b - 1, version_b)
+
+
+if __name__ == "__main__":
+
+    # make request
+    nct_id = "NCT00658567"
+    r = make_request(nct_id, 1, 36)
+    print(r.url)
+    print(r.status_code)
+
+    v = get_highest_version_number(r)
+    print("highest version", v)
+
+    # db connection
+    with psyco.connect(dbname="aact_db", user="root", host="localhost", password="root") as con:
+        with con.cursor() as cursor:
+            # download and store every version of the trial
+            download_trial_records(nct_id, cursor)
+            print("download complete")