# Post-patch form of get_highest_version_number() from
# history_downloader/downloader.py (commit bf28ad3, "Fixed version extraction").
# Relies on the module-level import `from bs4 import BeautifulSoup`.
def get_highest_version_number(response):
    """
    Return the highest version number listed in the version table.

    The response body is parsed as HTML and the rows of the table inside
    the first <fieldset> are examined.  The last row of that table is not
    always a version entry, so the rows are scanned from last to first and
    the first cell whose ``headers`` attribute starts with "VersionNumber"
    is taken: that is the last version cell in document order.

    Parameters:
        response: object exposing the page HTML as ``response.text``.

    Returns:
        int: the version number found in the matching cell.

    Raises:
        ValueError: if no cell is tagged "VersionNumber", or if the
            matching cell's text is not an integer.
        IndexError / AttributeError: if the page has no
            <fieldset>/<table>/<tbody> structure to navigate.
    """
    soup = BeautifulSoup(response.text, features="lxml")

    # Rows of the version table: first <fieldset> -> <table> -> <tbody>.
    table_rows = soup.find_all("fieldset")[0].table.tbody.find_all("tr")

    for row in reversed(table_rows):
        for td in row.find_all("td"):
            # Attribute values such as "headers" are lists here, hence the
            # [0]; an absent or empty attribute is skipped instead of
            # raising.
            headers = td.attrs.get("headers")
            if headers and headers[0] == "VersionNumber":
                return int(td.text)

    # Fail loudly rather than falling off the end and returning None.
    raise ValueError("no VersionNumber cell found in the version table")