Have the basics of a downloader and the SQL to support it.

history-download
youainti 4 years ago
parent 8fbe2c94e9
commit 725839df5b

@ -22,20 +22,13 @@ As not every request will have an xml doc, split them.
CREATE TABLE IF NOT EXISTS http.responses ( CREATE TABLE IF NOT EXISTS http.responses (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
nct VARCHAR(15), nct_id VARCHAR(15),
version SMALLINT, version_a SMALLINT,
version_b SMALLINT,
url VARCHAR(255), url VARCHAR(255),
response_code SMALLINT, response_code SMALLINT,
response_date DATE response_date TIMESTAMP WITH TIME ZONE,
html TEXT
); );
-- Stores the XML document belonging to a row of http.responses, kept in its
-- own table because not every request will have an xml doc.
CREATE TABLE IF NOT EXISTS http.xml_documents (
-- id is both this table's primary key and the foreign key declared below.
-- NOTE(review): SERIAL auto-generates values by default, so inserts presumably
-- have to supply the matching http.responses id explicitly -- confirm.
id SERIAL PRIMARY KEY,
xml XML,
CONSTRAINT http_response
FOREIGN KEY (id)
REFERENCES http.responses (id)
ON DELETE CASCADE --remove xml if the request is deleted
);

@ -4,8 +4,8 @@ conn = psyco.connect(dbname="aact_db", user="root", host="localhost", password="
curse = conn.cursor() curse = conn.cursor()
curse.execute("SELECT * FROM ctgov.studies LIMIT 2;") curse.execute("select * FROM http.responses")
print(curse.fetchall()) print(curse.fetchall())
curse.close() curse.close()
conn.close() conn.close()

@ -0,0 +1,96 @@
import requests
import psycopg2 as psyco
from datetime import datetime
from bs4 import BeautifulSoup
def get_highest_version_number(response):
    """Return the highest posted version number found in a history page.

    The page is parsed with the lxml parser. The value is read from the
    first cell of the last row of the table inside the first <fieldset>.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        The text of that cell converted with ``int``.
    """
    document = BeautifulSoup(response.text, features="lxml")
    # The version table sits inside the first fieldset of the page.
    first_fieldset = document.findChildren("fieldset")[0]
    table_rows = first_fieldset.table.tbody.findChildren("tr")
    # The last row's first cell carries the highest version.
    last_cell = table_rows[-1].td
    return int(last_cell.text)
def make_request(nct_id,version1,version2):
    """Request the side-by-side history comparison page for one trial.

    Args:
        nct_id: Trial identifier placed in the URL path.
        version1: Value for the ``A`` query parameter.
        version2: Value for the ``B`` query parameter.

    Returns:
        Whatever ``requests.get`` returns for the built URL.
    """
    url = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side".format(
        nct_id, version1, version2
    )
    return requests.get(url)
def upload_response(db_conn, nct_id, version_a, version_b, response):
    """Insert one HTTP response into http.responses and return the new id rows.

    Args:
        db_conn: Object exposing ``execute(sql, params)`` and ``fetchall()``;
            the caller in this file passes a database cursor.
        nct_id: Trial identifier stored in the ``nct_id`` column.
        version_a: First compared version (``version_a`` column).
        version_b: Second compared version (``version_b`` column).
        response: Object exposing ``headers``, ``url``, ``status_code`` and
            ``text``.

    Returns:
        The result of ``db_conn.fetchall()`` after the
        ``INSERT ... RETURNING id`` statement.

    Raises:
        KeyError: If ``response.headers`` has no ``'date'`` entry.
        ValueError: If the date header does not match the expected format.
    """
    # Local import: the file only imports `datetime` from the datetime module.
    from datetime import timezone

    # Read the header from the `response` argument. The previous code used a
    # name `r` that is not defined in this function, so it only worked when a
    # module-level `r` happened to exist and then stamped every row with that
    # other response's date.
    parsed = datetime.strptime(
        response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z"
    )
    # strptime's %Z yields a naive datetime. HTTP Date headers are in GMT, so
    # attach UTC explicitly rather than letting the database assume its
    # session time zone for the TIMESTAMP WITH TIME ZONE column.
    timestamp = parsed.replace(tzinfo=timezone.utc)

    #this uploads the response values.
    db_conn.execute(
        """
        INSERT INTO http.responses
        (nct_id,version_a,version_b,url,response_code,response_date, html)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        RETURNING id
        ;
        """,
        (
            nct_id,
            version_a,
            version_b,
            response.url,
            response.status_code,
            timestamp.isoformat(),
            response.text,  #FIX: there is an issue with storing the tags for some reason.
        ),
    )
    # Fetch from the object we executed on, not from a global `cursor`.
    pk_response = db_conn.fetchall()
    return pk_response
def download_trial_records(nct_id, db_conn):
    """Download and store the version comparison pages for one trial.

    The (1, 2) comparison is always requested and stored first; the highest
    version number is read from that page. Every pair yielded by
    ``step_generator`` is then requested and stored, and when the highest
    version is odd a final (1, highest) comparison is added.

    Args:
        nct_id: Trial identifier passed to ``make_request``.
        db_conn: Passed through unchanged to ``upload_response``.

    Returns:
        None.
    """
    # The first comparison doubles as the source of the version count.
    first_page = make_request(nct_id, 1, 2)
    upload_response(db_conn, nct_id, 1, 2, first_page)
    latest = get_highest_version_number(first_page)

    # Exactly two versions: the request above is all that is stored.
    if latest == 2:
        return None

    for left, right in step_generator(latest):
        page = make_request(nct_id, left, right)
        upload_response(db_conn, nct_id, left, right, page)

    # An odd highest version has no partner, so it is compared against version 1.
    if latest % 2 == 1:
        tail_page = make_request(nct_id, 1, latest)
        upload_response(db_conn, nct_id, 1, latest, tail_page)
def step_generator(max_version):
    """Yield version pairs (3, 4), (5, 6), ... whose members are <= max_version.

    Pairs (1, 2) are not produced; the caller requests those separately.

    Args:
        max_version: Highest version number available.

    Yields:
        Tuples ``(odd, even)`` with ``even == odd + 1`` and
        ``even <= max_version``. Nothing is yielded when ``max_version < 4``.
    """
    # The upper bound must be inclusive. With range(4, max_version, 2) an even
    # max_version lost its final pair (e.g. 4 yielded nothing, 6 yielded only
    # (3, 4)), so the newest versions were never downloaded.
    for even in range(4, max_version + 1, 2):
        yield (even - 1, even)
# Script entry point: requests one hard-coded trial's history page, prints
# its URL, status code and highest version, then downloads that trial's
# version comparisons into the database.
# NOTE(review): indentation is missing in this copy of the file; the nesting
# of the statements below (in particular the final print) must be restored
# before this can run.
if __name__ == "__main__":
#make request
nct_id = "NCT00658567"
# NOTE(review): `r` is a module-level name here, and upload_response in this
# file refers to `r` by name -- verify that coupling before renaming it.
r = make_request(nct_id,1,36)
print(r.url)
print(r.status_code)
v = get_highest_version_number(r)
print("highest version", v )
#db connection
# NOTE(review): database credentials are hard-coded; consider reading them
# from the environment or a config file.
# NOTE(review): presumably the psycopg2 connection context manager ends the
# transaction but does not close the connection -- confirm against the docs.
with psyco.connect(dbname="aact_db", user="root", host="localhost", password="root") as con:
# NOTE(review): `cursor` is also referenced by name inside upload_response;
# keep this name unless that function is changed.
with con.cursor() as cursor:
#insert response
# The cursor (not the connection) is passed as the db_conn argument.
download_trial_records(nct_id,cursor)
print("download complete")
Loading…
Cancel
Save