got connections working, started work on classifying why-ended
parent
f481c45d50
commit
d923c262a4
@ -0,0 +1,48 @@
|
|||||||
|
import psycopg2 as psyco
|
||||||
|
import pandas as pd
|
||||||
|
import nltk
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
# Stopword sets keyed by language, filled on first use so that importing this
# module does not require the NLTK stopwords corpus to be loaded.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language="english"):
    """Return NLTK's stopword list for *language* as a cached frozenset.

    ``stopwords.words`` is called only the first time a language is
    requested; later calls reuse the stored set, which also gives
    constant-time membership tests instead of a list scan.
    """
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def preporcess_text(text):
    """Normalise a free-text string before it is vectorised.

    The text is lower-cased, every run of non-letter characters is replaced
    by a single space, the result is tokenised with ``nltk.word_tokenize``,
    English stopwords are dropped, and the remaining tokens are joined with
    single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. It is empty when no tokens survive the
        stopword filter.
    """
    text = text.lower()
    # Keep letters only; digits and punctuation become token separators.
    text = re.sub(r"[^A-Za-z]+", " ", text)

    # make tokens
    tokens = nltk.word_tokenize(text)

    # remove stopwords (the set is looked up once, not once per token)
    stop_words = _stopword_set("english")
    tokens = [w for w in tokens if w not in stop_words]

    # rejoin
    return " ".join(tokens).strip()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: pull up to 2000 non-null ``why_stopped`` values,
    # clean them, vectorise with TF-IDF, cluster with k-means and print the
    # number of rows that landed in each cluster.

    # NOTE(review): database credentials are hard-coded here; consider
    # moving them to environment variables or a config file kept out of
    # version control.
    conn = psyco.connect(
        dbname="aact_db",
        user="analysis",
        host="localhost",
        password="test",
    )
    try:
        # The cursor context manager closes the cursor on exit, and the
        # ``finally`` clause closes the connection even when the query
        # raises, so neither handle is leaked on failure.
        with conn.cursor() as curse:
            curse.execute("SELECT why_stopped FROM ctgov.studies WHERE why_stopped IS NOT NULL LIMIT 2000;")
            results = curse.fetchall()
    finally:
        conn.close()

    # One column: the raw text in "corpus", the cleaned text in "cleaned".
    data = pd.DataFrame(results, columns=["corpus"])
    data["cleaned"] = data.corpus.apply(preporcess_text)

    vectorizer = TfidfVectorizer(sublinear_tf=True)
    X = vectorizer.fit_transform(data.cleaned)

    # Fixed random_state so repeated runs use the same seed.
    kmeans = KMeans(n_clusters=3, random_state=11021585)
    kmeans.fit(X)

    data["cluster"] = kmeans.labels_

    print(data.groupby(["cluster"]).count())
|
||||||
|
|
||||||
|
|
||||||
@ -1,10 +1,10 @@
|
|||||||
import psycopg2 as psyco
|
import psycopg2 as psyco
|
||||||
|
|
||||||
conn = psyco.connect(dbname="aact_db", user="admin", host="localhost", password="root")
|
conn = psyco.connect(dbname="aact_db", user="root", host="localhost", password="root")
|
||||||
|
|
||||||
curse = conn.cursor()
|
curse = conn.cursor()
|
||||||
|
|
||||||
curse.execute("SELECT * FROM testing")
|
curse.execute("SELECT * FROM ctgov.studies LIMIT 2;")
|
||||||
print(curse.fetchall())
|
print(curse.fetchall())
|
||||||
|
|
||||||
curse.close()
|
curse.close()
|
||||||
|
|||||||
Loading…
Reference in New Issue