You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ClinicalTrialsDataProcessing/downloader/downloader.py

97 lines
2.7 KiB
Python

import requests
import psycopg2 as psyco
from datetime import datetime
from bs4 import BeautifulSoup
def get_highest_version_number(response):
    """Return the highest posted version number found in a history page.

    The HTML in ``response.text`` is parsed with the lxml parser. The value
    is read from the first ``<fieldset>`` on the page: the text of the first
    cell in the last row of that fieldset's table body, converted to ``int``.
    """
    document = BeautifulSoup(response.text, features="lxml")
    first_fieldset = document.findChildren("fieldset")[0]
    table_rows = first_fieldset.table.tbody.findChildren("tr")
    # The last row is the one read for the highest posted version.
    last_row_cell = table_rows[-1].td
    return int(last_row_cell.text)
def make_request(nct_id,version1,version2):
    """GET the side-by-side history page comparing two versions of a trial.

    ``nct_id``, ``version1`` and ``version2`` are substituted, in that order,
    into the history URL; the ``requests`` response object is returned as-is.
    """
    url_template = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
    return requests.get(url_template.format(nct_id, version1, version2))
def upload_response(db_conn, nct_id, version_a, version_b, response):
    """Insert one HTTP response into ``http.responses`` and return the new id rows.

    Args:
        db_conn: Object exposing ``execute(sql, params)`` and ``fetchall()``.
            Despite the name, the script entry point passes a database
            cursor here.
        nct_id: Trial identifier stored alongside the response.
        version_a: First version number of the compared pair.
        version_b: Second version number of the compared pair.
        response: HTTP response providing ``headers['date']``, ``url``,
            ``status_code`` and ``text``.

    Returns:
        The result of ``db_conn.fetchall()`` after the insert, i.e. the rows
        produced by the statement's ``RETURNING id`` clause.

    Raises:
        KeyError: If the response has no ``date`` header.
        ValueError: If the ``date`` header does not match the expected
            ``"%a, %d %b %Y %H:%M:%S %Z"`` layout.
    """
    # Read the date from the response being stored. The previous code used an
    # unrelated global name here, so every row got the same timestamp.
    timestamp = datetime.strptime(
        response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z"
    )
    # FIX: there is an issue with storing the tags for some reason.
    db_conn.execute(
        """
        INSERT INTO http.responses
        (nct_id,version_a,version_b,url,response_code,response_date, html)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        RETURNING id
        ;
        """,
        (
            nct_id,
            version_a,
            version_b,
            response.url,
            response.status_code,
            timestamp.isoformat(),
            response.text,
        ),
    )
    # Fetch from the same object that executed the statement instead of
    # relying on a global named ``cursor`` existing.
    pk_response = db_conn.fetchall()
    return pk_response
def download_trial_records(nct_id, db_conn):
    """Download and store every version-comparison page for one trial.

    Versions 1 and 2 are requested and stored first; that response is also
    used to find the highest version number. When more than two versions
    exist, the pairs produced by ``step_generator`` are requested and stored.
    For an odd highest version, one extra comparison of version 1 against
    the highest version is stored as well. Always returns ``None``.
    """
    first_page = make_request(nct_id, 1, 2)
    upload_response(db_conn, nct_id, 1, 2, first_page)

    latest = get_highest_version_number(first_page)
    if latest == 2:
        # Only the pair already stored exists; nothing further to fetch.
        return None

    # The paired downloads are the same for even and odd version counts.
    for version_a, version_b in step_generator(latest):
        page = make_request(nct_id, version_a, version_b)
        upload_response(db_conn, nct_id, version_a, version_b, page)

    if latest % 2 == 1:
        # An odd count gets one extra request: version 1 against the highest.
        page = make_request(nct_id, 1, latest)
        upload_response(db_conn, nct_id, 1, latest, page)
    return None
def step_generator(max_version):
    """Yield consecutive version pairs ``(3, 4), (5, 6), ...`` up to ``max_version``.

    Each pair is ``(n - 1, n)`` for every even ``n`` with
    ``4 <= n <= max_version``. For an odd ``max_version`` the final, unpaired
    version is not yielded. Nothing is yielded when ``max_version < 4``.

    Args:
        max_version: Highest version number that may appear in a pair.

    Yields:
        Tuples ``(version_a, version_b)`` of adjacent version numbers.
    """
    # ``range`` excludes its end, so use ``max_version + 1``. Otherwise an
    # even ``max_version`` would drop its last pair (e.g. (3, 4) when it is 4).
    for upper in range(4, max_version + 1, 2):
        yield (upper - 1, upper)
if __name__ == "__main__":
    # NOTE: every name bound in this block (nct_id, r, v, con, cursor) is a
    # module-level global when the file is run as a script.

    # Probe request: print the resolved URL, status and highest version.
    nct_id = "NCT00658567"
    r = make_request(nct_id,1,36)
    print(r.url)
    print(r.status_code)
    v = get_highest_version_number(r)
    print("highest version", v )

    # Open the database connection and a cursor, then download and store
    # every version pair for the trial through that cursor.
    with psyco.connect(
        dbname="aact_db", user="root", host="localhost", password="root"
    ) as con, con.cursor() as cursor:
        download_trial_records(nct_id,cursor)
    print("download complete")