Fixed version extraction

4 years ago · bf28ad30b2
parent 9623bc0550
commit bf28ad30b2
1 changed files with 22 additions and 3 deletions
--- a/history_downloader/downloader.py
+++ b/history_downloader/downloader.py
@@ -6,11 +6,30 @@ from bs4 import BeautifulSoup

 from multiprocessing import Pool

+
 def get_highest_version_number(response):
-    #navigate to a specific part of the returned html and extract the highest posted version.
+    """
+    Navigate to the version table and extract the highest posted version.
+
+    As there are cases where the last element in the table IS NOT
+    a version entry, this function iterates from the last row entry to the first,
+    looking for cells with the correct header, indicating 
+    that it contains version information.
+    The last one occurring in the unreversed list is what we need.
+    """
+
    soup = BeautifulSoup(response.text, features="lxml")
-    version_value = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")[-1].td.text
-    return int(version_value)
+    #get version table rows
+    table_rows = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")
+
+    for row in reversed(table_rows):
+        # if it is <td headers="VersionNumber">xx</td> then it contains what we need.
+        for td in row.findChildren("td"):
+            print("\n", td)
+            if ("headers" in td.attrs) and (td.attrs["headers"][0]=="VersionNumber"):
+                #Note the use of [0] above. attribute elements are lists.
+                version_number = int(td.text)
+                return version_number

 def make_request(nct_id,version1,version2):
    #create url