have the basics of a downloader and the sql to support it.
parent
8fbe2c94e9
commit
725839df5b
@ -0,0 +1,96 @@
|
|||||||
|
import requests
|
||||||
|
import psycopg2 as psyco
|
||||||
|
from datetime import datetime
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
def get_highest_version_number(response):
    """Return the highest version number listed on a history page.

    The value is read from the first cell of the last table row inside the
    first ``<fieldset>`` of the returned HTML and converted to ``int``.

    Args:
        response: HTTP response whose ``text`` attribute holds the page HTML.

    Returns:
        The text of that cell as an integer.
    """
    page = BeautifulSoup(response.text, features="lxml")

    # Walk down step by step: first fieldset -> its table body -> last row.
    first_fieldset = page.find_all("fieldset")[0]
    table_rows = first_fieldset.table.tbody.find_all("tr")
    last_row = table_rows[-1]

    return int(last_row.td.text)
|
||||||
|
|
||||||
|
def make_request(nct_id,version1,version2, timeout=60):
    """Fetch the side-by-side comparison page for two versions of a trial.

    Args:
        nct_id: Trial identifier placed in the URL path.
        version1: Version number sent as the ``A`` query parameter.
        version2: Version number sent as the ``B`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    #create url
    baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
    url = baseurl.format(nct_id, version1, version2)

    #make request and hand the response back to the caller
    return requests.get(url, timeout=timeout)
|
||||||
|
|
||||||
|
def upload_response(db_conn, nct_id, version_a, version_b, response):
    """Store one downloaded comparison page in ``http.responses``.

    Args:
        db_conn: Object whose ``execute`` and ``fetchall`` methods run the
            insert and read its result. Despite the name, the script's entry
            point passes a psycopg2 cursor here.
        nct_id: Trial identifier the page belongs to.
        version_a: First version number that was requested.
        version_b: Second version number that was requested.
        response: HTTP response to store; its ``url``, ``status_code``,
            ``headers['date']`` and ``text`` are read.

    Returns:
        Whatever ``db_conn.fetchall()`` yields for the ``RETURNING id``
        clause of the insert.

    Raises:
        KeyError: if the response has no ``date`` header.
        ValueError: if the ``date`` header does not match the expected format.
    """
    # Use the response that was passed in (not a module-level variable) so the
    # stored date always belongs to the page being stored.
    timestamp = datetime.strptime(
        response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z"
    )

    #this uploads the response values.
    db_conn.execute(
        """
        INSERT INTO http.responses
        (nct_id,version_a,version_b,url,response_code,response_date, html)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        RETURNING id
        ;
        """,
        (
            nct_id,
            version_a,
            version_b,
            response.url,
            response.status_code,
            timestamp.isoformat(),
            # FIXME (original author's note): there is an issue with storing
            # the tags for some reason.
            response.text,
        ),
    )

    # Read the RETURNING row from the same object that ran the insert.
    pk_response = db_conn.fetchall()

    return pk_response
|
||||||
|
|
||||||
|
|
||||||
|
def download_trial_records(nct_id, db_conn):
    """Download and store the version-comparison pages for one trial.

    Versions 1 and 2 are requested first. The highest version number is read
    from that page, then every pair produced by ``step_generator`` is
    requested and stored. When the highest version is odd, one more
    comparison of version 1 against the highest version is stored.

    Args:
        nct_id: Trial identifier to download.
        db_conn: Passed through unchanged to ``upload_response``.

    Returns:
        None.
    """
    #download first 2 records
    first_page = make_request(nct_id, 1, 2)
    upload_response(db_conn, nct_id, 1, 2, first_page)

    #extract last version
    highest = get_highest_version_number(first_page)

    # Exactly two versions: the first request already covered everything.
    if highest == 2:
        return None

    #download the remaining versions, one pair per request
    for version_a, version_b in step_generator(highest):
        page = make_request(nct_id, version_a, version_b)
        upload_response(db_conn, nct_id, version_a, version_b, page)

    # An odd highest version is fetched by comparing it against version 1.
    if highest % 2 == 1:
        page = make_request(nct_id, 1, highest)
        upload_response(db_conn, nct_id, 1, highest, page)

    return None
|
||||||
|
|
||||||
|
def step_generator(max_version):
    """Yield consecutive (odd, even) version pairs from (3, 4) upwards.

    Pairs are produced as (3, 4), (5, 6), ... up to and including the pair
    whose even member equals ``max_version`` when ``max_version`` is even.
    For an odd ``max_version`` the last pair ends at ``max_version - 1``; the
    odd final version itself is not part of any pair.

    Args:
        max_version: Highest version number available.

    Yields:
        Tuples ``(version_a, version_b)`` with ``version_b == version_a + 1``.
    """
    # The upper bound is max_version + 1 because range() excludes its stop
    # value; without it an even max_version would lose its final pair.
    for even_version in range(4, max_version + 1, 2):
        yield (even_version - 1, even_version)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: fetch one hard-coded trial, print what came back,
    # then download its version-comparison pages into the database.

    #make request
    nct_id = "NCT00658567"
    r = make_request(nct_id,1,36)
    print(r.url)
    print(r.status_code)

    # Highest version number listed on the page that was just fetched.
    v = get_highest_version_number(r)
    print("highest version", v )

    #db connection
    # NOTE(review): the connection settings, including the password, are
    # hard-coded here.
    # NOTE(review): with psycopg2, using the connection in a `with` block is
    # understood to wrap a transaction rather than close the connection -
    # confirm against the psycopg2 documentation.
    with psyco.connect(dbname="aact_db", user="root", host="localhost", password="root") as con:
        with con.cursor() as cursor:
            #insert response

            # The cursor (not the connection) is what gets passed down as
            # the `db_conn` argument.
            download_trial_records(nct_id,cursor)
            print("download complete")
|
||||||
Loading…
Reference in New Issue