You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
48 lines
1.2 KiB
Python
48 lines
1.2 KiB
Python
import psycopg2 as psyco
|
|
import pandas as pd
|
|
import nltk
|
|
from nltk.corpus import stopwords
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.cluster import KMeans
|
|
import re
|
|
|
|
|
|
# Lazily-populated cache of NLTK's English stop words (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loading is deferred to first use so that importing this module
        # does not require the corpus to be present.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preporcess_text(text):
    """Normalise a free-text string for vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. replace every run of characters outside A-Z/a-z with one space;
      3. tokenise with ``nltk.word_tokenize``;
      4. drop tokens found in NLTK's English stop-word list;
      5. re-join the surviving tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated string (empty if nothing survives).
    """
    text = text.lower()
    text = re.sub(r"[^A-Za-z]+", " ", text)

    # make tokens
    tokens = nltk.word_tokenize(text)

    # remove stopwords -- the stop-word set is fetched once per call (and
    # loaded once per process) instead of once per token.
    stop_words = _english_stopwords()
    tokens = [w for w in tokens if w not in stop_words]

    # rejoin
    return " ".join(tokens).strip()
|
|
|
|
if __name__ == "__main__":
    # Fetch up to 2000 non-null "why_stopped" free-text values from
    # ctgov.studies in the local aact_db database.
    # NOTE(review): the credentials are hard-coded; consider reading them
    # from the environment or a config file instead.
    conn = psyco.connect(dbname="aact_db", user="analysis", host="localhost", password="test")
    try:
        # The cursor context manager closes the cursor on exit, including
        # when the query raises.
        with conn.cursor() as curse:
            curse.execute("SELECT why_stopped FROM ctgov.studies WHERE why_stopped IS NOT NULL LIMIT 2000;")
            results = curse.fetchall()
    finally:
        # Close explicitly so the connection is released even on failure
        # (psycopg2's `with conn:` ends the transaction but does not close).
        conn.close()

    # One row per fetched record; "cleaned" holds the normalised text.
    data = pd.DataFrame(results, columns=["corpus"])
    data["cleaned"] = data.corpus.apply(preporcess_text)

    # TF-IDF features with sublinear term-frequency scaling.
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    X = vectorizer.fit_transform(data.cleaned)

    # Cluster into 10 groups; the fixed random_state keeps runs repeatable.
    kmeans = KMeans(n_clusters=10, random_state=11021585)
    kmeans.fit(X)

    data["cluster"] = kmeans.labels_

    # Report how many records landed in each cluster.
    print(data.groupby(["cluster"])["cleaned"].count())
|
|
|
|
|