got connections working, started work on classifying why-ended
parent
f481c45d50
commit
d923c262a4
@ -0,0 +1,48 @@
|
||||
import psycopg2 as psyco
|
||||
import pandas as pd
|
||||
import nltk
|
||||
from nltk.corpus import stopwords
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.cluster import KMeans
|
||||
import re
|
||||
|
||||
|
||||
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preporcess_text(text):
    """Normalise a free-text string before vectorisation.

    The text is lower-cased, every run of non-letter characters is replaced
    by a single space, the result is tokenised with NLTK, English stopwords
    are dropped, and the remaining tokens are re-joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    text = text.lower()
    text = re.sub("[^A-Za-z]+", " ", text)

    # make tokens
    tokens = nltk.word_tokenize(text)

    # remove stopwords; the stopword collection is fetched once and used as a
    # set, instead of calling stopwords.words("english") for every token and
    # doing a linear list membership test each time
    stop_words = _english_stopwords()
    tokens = [w for w in tokens if w not in stop_words]

    # rejoin
    return " ".join(tokens).strip()
|
||||
|
||||
if __name__ == "__main__":
    # Pull up to 2000 non-null why_stopped texts from the aact_db database.
    conn = psyco.connect(dbname="aact_db", user="analysis", host="localhost", password="test")
    try:
        # The cursor context manager closes the cursor when the block exits.
        with conn.cursor() as curse:
            curse.execute("SELECT why_stopped FROM ctgov.studies WHERE why_stopped IS NOT NULL LIMIT 2000;")
            results = curse.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    # One row per fetched record; "cleaned" holds the normalised text.
    data = pd.DataFrame(results, columns=["corpus"])
    data["cleaned"] = data.corpus.apply(preporcess_text)

    # TF-IDF features with sublinear term-frequency scaling.
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    X = vectorizer.fit_transform(data.cleaned)

    # Cluster into three groups; the fixed seed keeps runs reproducible.
    kmeans = KMeans(n_clusters=3, random_state=11021585)
    kmeans.fit(X)

    data["cluster"] = kmeans.labels_

    # Report how many rows landed in each cluster.
    print(data.groupby(["cluster"]).count())
|
||||
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
import psycopg2 as psyco
|
||||
|
||||
# Connect to aact_db, fetch two rows from ctgov.studies and print them.
conn = psyco.connect(dbname="aact_db", user="root", host="localhost", password="root")
try:
    # The cursor context manager closes the cursor when the block exits.
    with conn.cursor() as curse:
        curse.execute("SELECT * FROM ctgov.studies LIMIT 2;")
        print(curse.fetchall())
finally:
    # Release the connection even if the query fails.
    conn.close()
|
||||
|
||||
Loading…
Reference in New Issue