def get_highest_version_number(response):
    """
    Navigate to the version table and extract the highest posted version.

    The last row of the table is NOT always a version entry, so the rows
    are scanned from the last one to the first, looking for a cell whose
    ``headers`` attribute starts with "VersionNumber". The first match in
    that reversed scan (i.e. the last one in document order) is returned.

    Parameters:
        response: object exposing the page HTML through a ``.text``
            attribute (presumably a ``requests`` response - confirm
            against the caller).

    Returns:
        The version number as an ``int``, or ``None`` when no row holds
        a version cell.

    Raises:
        IndexError: the page contains no ``<fieldset>`` element.
        AttributeError: the first fieldset has no ``table``/``tbody``.
        ValueError: the version cell's text is not an integer.
    """
    soup = BeautifulSoup(response.text, features="lxml")

    # Rows of the version table inside the first <fieldset> of the page.
    table_rows = soup.find_all("fieldset")[0].table.tbody.find_all("tr")

    for row in reversed(table_rows):
        for td in row.find_all("td"):
            # ``headers`` is a multi-valued attribute, so Beautiful Soup
            # normally hands it back as a list; tolerate a missing or
            # empty attribute, or a plain string, instead of raising.
            headers = td.get("headers") or []
            if isinstance(headers, str):
                headers = headers.split()
            if headers and headers[0] == "VersionNumber":
                return int(td.text)

    # No version cell anywhere in the table: make the fall-through explicit.
    return None