You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ClinicalTrialsDataProcessing/Parser/textprocessing.py

121 lines
5.1 KiB
Python

from cgitb import html
import re
# Scratch script: reduce an HTML "change history" fragment to plain text.
#
# The sample markup wraps changed values in <span class="drop_hilite"> and
# <span class="add_hilite"> elements.  Judging by the pattern names below,
# "drop" spans presumably carry the old value and "add" spans the new one
# (NOTE(review): confirm against the page that produces this markup).

# Full sample: one "Study Status" module with label/value table rows.
form = """
<tr>
<td colspan="2">
<form id="form_StudyStatus">
<div class="w3-responsive">
<fieldset class="entryReq" id="StudyStatus"
style="margin:auto;margin-bottom:1em;padding-bottom:0.5em;width:98%;">
<legend class="moduleLabel"> <img id="StudyStatusImg" class="toggleImage"
onclick="toggleModule('StudyStatus');" src="html/images/collapse.png"
alt='Open or close this module'>
Study Status</legend>
<div id="StudyStatusBody" class="moduleBody">
<table class="indent1 moduleTable resultTable">
<thead>
<tr>
<th style="width:210px;"></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Overall Status:</td>
<td><span class="drop_hilite">Recruiting</span> <span class="add_hilite">Completed</span></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Study Start:</td>
<td>March 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Primary Completion:</td>
<td> <span class="add_hilite">December 2009 [Actual]</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Study Completion:</td>
<td>December 2009 [ <span class="drop_hilite">Anticipated</span> <span
class="add_hilite">Actual</span>] </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Submitted:</td>
<td>April 10, 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Submitted that<br />Met QC Criteria:</td>
<td>April 10, 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Posted:</td>
<td>April 15, 2008 [Estimate] </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Submitted:</td>
<td> <span class="add_hilite">February 6, 2014</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Submitted that<br />Met QC
Criteria:</td>
<td> <span class="add_hilite">August 29, 2014</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Posted:</td>
<td> <span class="add_hilite">September 9, 2014 [Estimate]</span> </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Last Update Submitted that<br />Met QC Criteria:
</td>
<td>April <span class="drop_hilite">10</span> <span class="add_hilite">18</span>, <span
class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Last Update Posted:</td>
<td><span class="drop_hilite">April 15, 2008 [Estimate]</span> <span class="add_hilite">May 19,
2017 [Actual]</span> </td>
</tr>
</tbody>
</table>
</div>
</fieldset>
</div>
</form>
</td>
</tr>
"""

# Minimal sample: a single table row taken from ``form`` above.
entry1 = """
<tr>
<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
"""

# Patterns are raw strings so that ``\s`` reaches the regex engine instead of
# being parsed as an (invalid) string escape.
#
# * ``<span\s+class=`` tolerates the tag being wrapped across lines, which
#   happens in ``form`` ("<span\nclass=...").
# * ``[^<]*`` matches the whole highlighted value, including spaces, commas,
#   brackets and newlines ("May 19,\n2017 [Actual]"); ``\w*`` only matched
#   single-word values.
# * The trailing ``\s?`` swallows at most one whitespace character after the
#   span, i.e. the separator between a drop/add pair.
drop_old_re = re.compile(r'<span\s+class="drop_hilite">[^<]*</span>\s?')
drop_new_re = re.compile(r'<span\s+class="add_hilite">[^<]*</span>\s?')

# Any opening or closing tag.  The previous character class contained ``=-_``,
# which is a *range* (U+003D..U+005F): it excluded ``-`` and included ``>``,
# so tags with attributes such as style="min-width: ..." were never removed.
# Requiring a letter after ``<`` (or ``</``) avoids treating a stray "<" in
# text as the start of a tag.
drop_tags_re = re.compile(r'</?[A-Za-z][^<>]*>')

# Row with the "add" spans removed.
print(drop_new_re.sub("", entry1))
# Row with the "drop" spans removed.
print(drop_old_re.sub("", entry1))
# Row with every tag stripped but both highlighted values kept.
print(drop_tags_re.sub("", entry1))
# Plain text of the row after removing the "add" spans.
print(drop_tags_re.sub("", drop_new_re.sub("", entry1)))
# Plain text of the whole module after removing the "add" spans.
print(drop_tags_re.sub("", drop_new_re.sub("", form)))