You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ClinicalTrialsDataProcessing/classifications/classify_terminations.py

48 lines
1.2 KiB
Python

import psycopg2 as psyco
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
# Compiled once: matches every run of characters that are not ASCII letters.
_NON_LETTERS = re.compile(r"[^A-Za-z]+")

# Lazily populated cache of the NLTK English stop-word list (see helper below).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() returns a fresh list on every call; caching it as
        # a set avoids re-loading it per token and makes membership tests O(1).
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preporcess_text(text):
    """Normalize a free-text string for vectorization.

    The text is lower-cased, every run of non-letter characters is replaced
    by a single space, the result is tokenized with ``nltk.word_tokenize``,
    English stop words are dropped, and the remaining tokens are re-joined
    with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    text = text.lower()
    text = _NON_LETTERS.sub(" ", text)
    # make tokens
    tokens = nltk.word_tokenize(text)
    # remove stopwords
    stop_words = _english_stopwords()
    tokens = [w for w in tokens if w not in stop_words]
    # rejoin
    return " ".join(tokens).strip()
if __name__ == "__main__":
    # Fetch up to 2000 non-null `why_stopped` values from ctgov.studies in the
    # `aact_db` database.
    # NOTE(review): connection credentials are hard-coded here; consider
    # moving them to configuration or environment variables.
    conn = psyco.connect(dbname="aact_db", user="analysis", host="localhost", password="test")
    try:
        # The cursor context manager closes the cursor on exit, and the
        # `finally` clause closes the connection, so neither is leaked if the
        # query or fetch raises.
        with conn.cursor() as curse:
            curse.execute("SELECT why_stopped FROM ctgov.studies WHERE why_stopped IS NOT NULL LIMIT 2000;")
            results = curse.fetchall()
    finally:
        conn.close()

    # One row per fetched value; "cleaned" holds the normalized text.
    data = pd.DataFrame(results, columns=["corpus"])
    data["cleaned"] = data.corpus.apply(preporcess_text)

    # Turn the cleaned text into a TF-IDF matrix (sublinear TF scaling on).
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    X = vectorizer.fit_transform(data.cleaned)

    # Cluster the documents into 10 groups with a fixed random seed.
    kmeans = KMeans(n_clusters=10, random_state=11021585)
    kmeans.fit(X)
    data["cluster"] = kmeans.labels_

    # Report how many documents were assigned to each cluster.
    print(data.groupby(["cluster"])["cleaned"].count())