"""Cluster free-text ``why_stopped`` values from ``ctgov.studies``.

The script pulls up to 2000 non-null ``why_stopped`` strings from the
``aact_db`` PostgreSQL database, normalises them, builds TF-IDF vectors and
groups them into 10 clusters with KMeans, then prints the size of each
cluster.
"""

import functools
import re

import nltk
import pandas as pd
import psycopg2 as psyco
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Anything that is not an ASCII letter is collapsed into a single space.
_NON_LETTERS = re.compile(r"[^A-Za-z]+")


@functools.lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loaded only once.

    The original code re-read the stopword list for every single token;
    caching it as a set makes each membership test O(1) without changing
    which words are filtered.
    """
    return frozenset(stopwords.words("english"))


def preporcess_text(text):
    """Normalise one piece of text for vectorisation.

    The text is lower-cased, every run of non-letter characters is replaced
    by a space, the result is tokenised with ``nltk.word_tokenize`` and
    English stopwords are dropped.

    The (misspelled) function name is kept as-is so existing callers keep
    working.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces, with surrounding
        whitespace stripped. May be the empty string.

    Note:
        NLTK's tokenizer and stopword data presumably need to be downloaded
        beforehand (``nltk.download``) -- verify in the target environment.
    """
    text = _NON_LETTERS.sub(" ", text.lower())
    stop_words = _english_stopwords()
    tokens = [w for w in nltk.word_tokenize(text) if w not in stop_words]
    return " ".join(tokens).strip()


def main():
    """Fetch the texts, cluster them and print the per-cluster counts."""
    # NOTE(review): credentials are hard-coded here; consider moving them to
    # configuration or environment variables.
    conn = psyco.connect(
        dbname="aact_db", user="analysis", host="localhost", password="test"
    )
    try:
        # The cursor context manager closes the cursor even if the query
        # fails; the finally clause does the same for the connection.
        with conn.cursor() as curse:
            curse.execute(
                "SELECT why_stopped FROM ctgov.studies WHERE why_stopped IS NOT NULL LIMIT 2000;"
            )
            results = curse.fetchall()
    finally:
        conn.close()

    data = pd.DataFrame(results, columns=["corpus"])
    data["cleaned"] = data.corpus.apply(preporcess_text)

    vectorizer = TfidfVectorizer(sublinear_tf=True)
    X = vectorizer.fit_transform(data.cleaned)

    # Fixed random_state keeps the cluster assignment reproducible.
    kmeans = KMeans(n_clusters=10, random_state=11021585)
    kmeans.fit(X)
    data["cluster"] = kmeans.labels_

    print(data.groupby(["cluster"])["cleaned"].count())


if __name__ == "__main__":
    main()