Fixed version extraction

history-download
will king 4 years ago
parent 9623bc0550
commit bf28ad30b2

@ -6,11 +6,30 @@ from bs4 import BeautifulSoup
from multiprocessing import Pool
def get_highest_version_number(response):
    """
    Extract the highest posted version number from an HTML response.

    The version table is the table inside the first <fieldset> of the page.
    The last row of that table is NOT always a version entry, so the rows
    are scanned from last to first, and the first cell found whose
    ``headers`` attribute begins with "VersionNumber" supplies the result.
    That cell belongs to the last version row of the unreversed table.

    Args:
        response: object exposing the page HTML as ``response.text``.

    Returns:
        The version number as an int, or None when no row contains a
        VersionNumber cell.

    Raises:
        IndexError: the page contains no <fieldset>.
        AttributeError: the first fieldset has no table/tbody.
        ValueError: the VersionNumber cell text is not an integer.
    """
    soup = BeautifulSoup(response.text, features="lxml")

    # Rows of the version table.
    table_rows = soup.find_all("fieldset")[0].table.tbody.find_all("tr")

    for row in reversed(table_rows):
        for td in row.find_all("td"):
            # <td headers="VersionNumber">xx</td> holds what we need.
            # The parsed attribute value is a list, hence the [0]; the
            # truthiness check also guards against a missing or empty
            # headers attribute.
            headers = td.get("headers")
            if headers and headers[0] == "VersionNumber":
                return int(td.text)

    # No version cell anywhere in the table.
    return None
def make_request(nct_id,version1,version2):
#create url

Loading…
Cancel
Save