Got database connections working; started work on classifying why studies ended (the `why_stopped` text).

history-download
youainti 4 years ago
parent f481c45d50
commit d923c262a4

@ -0,0 +1,48 @@
import psycopg2 as psyco
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
def preporcess_text(text):
    """Normalize a free-text string before vectorization.

    The text is lower-cased, every run of non-letter characters is replaced
    by a single space, the result is tokenized with NLTK, and English
    stopwords are removed.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces, with surrounding
        whitespace stripped.
    """
    text = text.lower()
    text = re.sub(r"[^A-Za-z]+", " ", text)
    # Build the stopword set once per call; evaluating stopwords.words()
    # inside the comprehension would rebuild the list and scan it linearly
    # for every single token.
    english_stopwords = frozenset(stopwords.words("english"))
    # Tokenize, then drop stopwords.
    tokens = [w for w in nltk.word_tokenize(text) if w not in english_stopwords]
    # Rejoin into a single cleaned string.
    return " ".join(tokens).strip()
if __name__ == "__main__":
    # Fetch up to 2000 non-null why_stopped values from ctgov.studies.
    # NOTE(review): the credentials are hard-coded; consider reading them
    # from configuration or environment variables instead.
    conn = psyco.connect(dbname="aact_db", user="analysis", host="localhost", password="test")
    try:
        # The cursor context manager closes the cursor on exit; the finally
        # clause guarantees the connection is released even if the query fails.
        with conn.cursor() as curse:
            curse.execute("SELECT why_stopped FROM ctgov.studies WHERE why_stopped IS NOT NULL LIMIT 2000;")
            results = curse.fetchall()
    finally:
        conn.close()

    # One row per fetched value; "cleaned" holds the preprocessed text.
    data = pd.DataFrame(results, columns=["corpus"])
    data["cleaned"] = data.corpus.apply(preporcess_text)

    # TF-IDF features over the cleaned text.
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    X = vectorizer.fit_transform(data.cleaned)

    # Cluster into three groups; the fixed random_state keeps runs repeatable.
    kmeans = KMeans(n_clusters=3, random_state=11021585)
    kmeans.fit(X)
    data["cluster"] = kmeans.labels_

    # Report how many rows landed in each cluster.
    print(data.groupby(["cluster"]).count())

@ -1,10 +1,10 @@
import psycopg2 as psyco
# NOTE(review): this hunk lists two connect() calls and two execute() calls
# back to back; they look like the removed and added versions of the same
# lines shown without +/- markers -- confirm against the repository before
# running this as-is.
conn = psyco.connect(dbname="aact_db", user="admin", host="localhost", password="root")
conn = psyco.connect(dbname="aact_db", user="root", host="localhost", password="root")
curse = conn.cursor()
# Run the queries on the open cursor.
curse.execute("SELECT * FROM testing")
curse.execute("SELECT * FROM ctgov.studies LIMIT 2;")
# Print every row returned by the most recent execute().
print(curse.fetchall())
curse.close()

Loading…
Cancel
Save