added feature to extract start_date. deleted unused files and modified others for ease of use.
parent
453e82974e
commit
9d5a726494
File diff suppressed because it is too large
Load Diff
@ -1,184 +0,0 @@
|
|||||||
from copy import copy
|
|
||||||
from datetime import datetime
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import re
|
|
||||||
|
|
||||||
# Sample HTML fragment: a "Study Status" module table whose cells mark removed
# text with <span class="drop_hilite"> and added text with
# <span class="add_hilite">. It is parsed with BeautifulSoup in the
# ``__main__`` demo at the bottom of this file.
# The markup is kept flush-left so that no extra whitespace ends up inside
# cells whose text wraps across lines.
form = """
<tr>
<td colspan="2">
<form id="form_StudyStatus">
<div class="w3-responsive">
<fieldset class="entryReq" id="StudyStatus"
style="margin:auto;margin-bottom:1em;padding-bottom:0.5em;width:98%;">
<legend class="moduleLabel"> <img id="StudyStatusImg" class="toggleImage"
onclick="toggleModule('StudyStatus');" src="html/images/collapse.png"
alt='Open or close this module'>
Study Status</legend>
<div id="StudyStatusBody" class="moduleBody">
<table class="indent1 moduleTable resultTable">
<thead>
<tr>
<th style="width:210px;"></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Overall Status:</td>
<td><span class="drop_hilite">Recruiting</span> <span class="add_hilite">Completed</span></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Study Start:</td>
<td>March 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Primary Completion:</td>
<td> <span class="add_hilite">December 2009 [Actual]</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Study Completion:</td>
<td>December 2009 [ <span class="drop_hilite">Anticipated</span> <span
class="add_hilite">Actual</span>] </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Submitted:</td>
<td>April 10, 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Submitted that<br />Met QC Criteria:</td>
<td>April 10, 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Posted:</td>
<td>April 15, 2008 [Estimate] </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Submitted:</td>
<td> <span class="add_hilite">February 6, 2014</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Submitted that<br />Met QC
Criteria:</td>
<td> <span class="add_hilite">August 29, 2014</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Posted:</td>
<td> <span class="add_hilite">September 9, 2014 [Estimate]</span> </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Last Update Submitted that<br />Met QC Criteria:
</td>
<td>April <span class="drop_hilite">10</span> <span class="add_hilite">18</span>, <span
class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Last Update Posted:</td>
<td><span class="drop_hilite">April 15, 2008 [Estimate]</span> <span class="add_hilite">May 19,
2017 [Actual]</span> </td>
</tr>
</tbody>
</table>
</div>
</fieldset>
</div>
</form>
</td>
</tr>
"""
|
|
||||||
|
|
||||||
|
|
||||||
# A single table row: a label cell followed by a value cell containing both a
# removed (drop_hilite) and an added (add_hilite) span.
entry1 = """
<tr>
<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
"""

# A lone cell whose whole content is one added span; the span text contains
# spaces and square brackets.
entry2 = '<td> <span class="add_hilite">December 2009 [Actual]</span> </td>'
|
|
||||||
|
|
||||||
# A complete <span class="drop_hilite">...</span> element (removed text) plus at
# most one trailing whitespace character. The content is matched lazily and
# with DOTALL so that values containing spaces, commas, brackets or line
# breaks (e.g. "April 15, 2008 [Estimate]") are covered as well.
DROP_HILITE_re = re.compile(r'<span\s+class="drop_hilite">.*?</span>\s?', re.DOTALL)

# Same as above for <span class="add_hilite">...</span> (added text).
ADD_HILITE_re = re.compile(r'<span\s+class="add_hilite">.*?</span>\s?', re.DOTALL)

# Any single markup tag: "<", anything that is not an angle bracket, ">".
# A negated class is used instead of an allow-list so that attribute values
# containing "-" (e.g. style="min-width: 210px;") are matched too.
TAGS_RE = re.compile(r"<[^<>]+>")


def extract_new_data(td):
    """
    Return the text of a cell as it reads after the change.

    The cell is converted with ``str()``, every ``drop_hilite`` span is
    replaced by a single space, all remaining tags are removed and the
    result is stripped. Inner whitespace is not collapsed.

    :param td: a cell object whose ``str()`` is its HTML (a plain HTML
        string works as well).
    :return: the remaining text, possibly the empty string.
    """
    text = str(td)
    return TAGS_RE.sub("", DROP_HILITE_re.sub(" ", text)).strip()


def extract_old_data(td):
    """
    Return the text of a cell as it read before the change.

    The cell is converted with ``str()``, every ``add_hilite`` span is
    replaced by a single space, all remaining tags are removed and the
    result is stripped. Inner whitespace is not collapsed.

    :param td: a cell object whose ``str()`` is its HTML (a plain HTML
        string works as well).
    :return: the remaining text, possibly the empty string.
    """
    text = str(td)
    return TAGS_RE.sub("", ADD_HILITE_re.sub(" ", text)).strip()


def delete_tags(td):
    """
    Return the cell's HTML with every tag replaced by a space, stripped.

    Unlike the two extractors above, the text inside both kinds of hilite
    span is kept.

    :param td: a cell object whose ``str()`` is its HTML (a plain HTML
        string works as well).
    :return: the text content with tags blanked out.
    """
    text = str(td)
    return TAGS_RE.sub(" ", text).strip()
|
|
||||||
|
|
||||||
|
|
||||||
def extract_date_and_tag(text, date_format):
    """
    Split a string such as ``"April 2008 [Estimate]"`` into its tag and date.

    Everything before the first ``"["`` is parsed as a date with
    ``datetime.strptime`` using ``date_format``. The text between that
    ``"["`` and the following ``"]"`` (stripped) is the tag.

    :param text: the string to parse; may be ``None`` or empty.
    :param date_format: a ``strptime`` format string, e.g. ``"%B %Y"``.
    :return: a tuple ``(estimate_tag, date_object)``. ``estimate_tag`` is
        ``None`` when the text has no ``"["``. Both items are ``None`` when
        ``text`` is ``None``, empty or only whitespace, so callers can
        always unpack two values.
    :raises ValueError: if the date part does not match ``date_format``.
    """
    if not text or not text.strip():
        return None, None

    date_split = text.split("[")
    if len(date_split) > 1:
        estimate_tag = date_split[1].split("]")[0].strip()
    else:
        estimate_tag = None
    date_object = datetime.strptime(date_split[0].strip(), date_format)

    return estimate_tag, date_object
|
|
||||||
#TODO: Write test
|
|
||||||
|
|
||||||
def extract_text_and_tag(text):
    """
    Placeholder; not implemented yet.

    The function currently ignores ``text`` and returns ``None``.

    :param text: unused for now.
    :return: ``None``.
    """
    return None
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Ad-hoc demo: parse the sample fixtures and print what the extractors
    # return for them.
    entry_soup = BeautifulSoup(entry1, "lxml")
    form_soup = BeautifulSoup(form, "lxml")

    # The second cell of the single-row fixture holds the value.
    print(extract_new_data(entry_soup.find_all("td")[1]))
    print(extract_old_data(entry_soup.find_all("td")[1]))

    # One output line per table row: rows without cells, rows with a single
    # cell, and rows whose second cell carries the new/old value.
    for tr in form_soup.find_all("tr"):
        data = tr.find_all("td")
        if not data:
            print("no data")
        elif len(data) == 1:
            print("1\t", data[0])
        else:
            print(len(data), "\t", extract_new_data(data[1]), "\t|\t", extract_old_data(data[1]))

    # print(extract_date_and_tag(extract_old_data(entry_soup.find_all("td")[1]), "%B %Y"))
    print(extract_date_and_tag("April 2008 [ test ]", "%B %Y"))

    entry2_soup = BeautifulSoup(entry2, "lxml")
    # NOTE: the original author flagged this call with "error here."
    print(extract_old_data(entry2_soup))
    print(extract_new_data(entry2_soup))

    # Alternative approach: remove the add_hilite span from a copy of the
    # parsed document and print what is left and what was removed.
    entry3_soup = copy(entry2_soup)
    print(entry3_soup)
    removed_span = entry3_soup.find_all(class_="add_hilite")[0].extract()
    print(entry3_soup.text)
    print(removed_span.text)
|
|
||||||
Loading…
Reference in New Issue