"""Download side-by-side version comparisons of a ClinicalTrials.gov record.

Each comparison page is fetched from the ``ct2/history`` endpoint and stored,
together with some request metadata, in the ``http.responses`` table.
"""
from datetime import datetime

import psycopg2 as psyco
import requests
from bs4 import BeautifulSoup

HISTORY_URL = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"

# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` can block forever on a stalled connection.
DEFAULT_TIMEOUT = 60

# Format of the HTTP ``Date`` header, e.g. "Tue, 15 Nov 1994 08:12:31 GMT".
HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"

INSERT_RESPONSE_SQL = """
    INSERT INTO http.responses
    (nct_id,version_a,version_b,url,response_code,response_date, html)
    VALUES (%s,%s,%s,%s,%s,%s,%s)
    RETURNING id
    ;
"""


def get_highest_version_number(response):
    """Return the highest version number listed on a history page.

    The value is read from the first cell of the last table row inside the
    first ``<fieldset>`` of the returned HTML.

    Args:
        response: Response object whose ``.text`` holds the page HTML.

    Returns:
        The highest posted version as an ``int``.

    Raises:
        IndexError: If the page has no ``<fieldset>`` or the table has no rows.
        AttributeError: If the expected table/tbody/td elements are missing.
        ValueError: If the cell text is not an integer.
    """
    soup = BeautifulSoup(response.text, features="lxml")
    first_fieldset = soup.find_all("fieldset")[0]
    last_row = first_fieldset.table.tbody.find_all("tr")[-1]
    return int(last_row.td.text)


def make_request(nct_id, version1, version2, timeout=DEFAULT_TIMEOUT):
    """Fetch the side-by-side comparison page for two versions of a trial.

    Args:
        nct_id: Trial identifier placed in the URL path.
        version1: Version number sent as the ``A`` query parameter.
        version2: Version number sent as the ``B`` query parameter.
        timeout: Seconds to wait for the server before raising
            ``requests.exceptions.Timeout``.

    Returns:
        The ``requests`` response object.
    """
    url = HISTORY_URL.format(nct_id, version1, version2)
    return requests.get(url, timeout=timeout)


def upload_response(db_conn, nct_id, version_a, version_b, response):
    """Insert one downloaded response into ``http.responses``.

    Args:
        db_conn: Object used to run the statement; it must provide
            ``execute`` and ``fetchall`` (the script passes a cursor).
        nct_id: Trial identifier the response belongs to.
        version_a: First version of the comparison.
        version_b: Second version of the comparison.
        response: The response to store; its URL, status code, ``date``
            header and body text are written.

    Returns:
        The rows produced by ``RETURNING id`` for the inserted record.
    """
    # Read the date from the response being stored, not from a module global.
    timestamp = datetime.strptime(response.headers["date"], HTTP_DATE_FORMAT)

    db_conn.execute(
        INSERT_RESPONSE_SQL,
        (
            nct_id,
            version_a,
            version_b,
            response.url,
            response.status_code,
            timestamp.isoformat(),
            # FIXME: there is an issue with storing the tags for some reason.
            response.text,
        ),
    )
    # Fetch from the same object that executed the statement.
    return db_conn.fetchall()


def download_trial_records(nct_id, db_conn):
    """Download and store every version comparison for one trial.

    Versions 1 and 2 are fetched first; that page also reveals the highest
    version number. The remaining versions are then fetched in consecutive
    pairs (3-4, 5-6, ...). When the highest version is odd it has no partner,
    so it is fetched in a comparison against version 1.

    Args:
        nct_id: Trial identifier to download.
        db_conn: Passed through to ``upload_response``.

    Returns:
        None.
    """
    first_page = make_request(nct_id, 1, 2)
    upload_response(db_conn, nct_id, 1, 2, first_page)

    highest = get_highest_version_number(first_page)
    if highest == 2:
        # Everything was already covered by the first request.
        return None

    for version_a, version_b in step_generator(highest):
        page = make_request(nct_id, version_a, version_b)
        upload_response(db_conn, nct_id, version_a, version_b, page)

    if highest % 2 == 1:
        # The unpaired last version is compared against version 1.
        page = make_request(nct_id, 1, highest)
        upload_response(db_conn, nct_id, 1, highest, page)
    return None


def step_generator(max_version):
    """Yield consecutive version pairs (3, 4), (5, 6), ... up to max_version.

    The upper bound is inclusive, so an even ``max_version`` is the second
    element of the final pair. An odd ``max_version`` is not yielded.

    Args:
        max_version: Highest version number available.

    Yields:
        Tuples ``(version_a, version_b)`` with ``version_b == version_a + 1``.
    """
    # range() excludes its stop value; + 1 keeps the pair ending at max_version.
    for upper in range(4, max_version + 1, 2):
        yield (upper - 1, upper)


def main():
    """Download all version comparisons for a single hard-coded trial."""
    nct_id = "NCT00658567"

    # Probe request, only used to print some diagnostics.
    probe = make_request(nct_id, 1, 36)
    print(probe.url)
    print(probe.status_code)
    highest = get_highest_version_number(probe)
    print("highest version", highest)

    # The connection context manager commits on success and rolls back on
    # error (psycopg2 behavior); it does not close the connection.
    with psyco.connect(dbname="aact_db", user="root", host="localhost", password="root") as con:
        with con.cursor() as cursor:
            download_trial_records(nct_id, cursor)
    print("download complete")


if __name__ == "__main__":
    main()