diff --git a/DockerContainers/classifications/classify_terminations.py b/DockerContainers/classifications/classify_terminations.py
new file mode 100644
index 0000000..d2a7702
--- /dev/null
+++ b/DockerContainers/classifications/classify_terminations.py
@@ -0,0 +1,82 @@
+"""Cluster ``why_stopped`` texts from ``ctgov.studies`` using TF-IDF and k-means."""
+
+import re
+from functools import lru_cache
+
+import nltk
+import pandas as pd
+import psycopg2 as psyco
+from nltk.corpus import stopwords
+from sklearn.cluster import KMeans
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+# Any run of characters other than ASCII letters becomes a single space.
+_NON_LETTERS = re.compile(r"[^A-Za-z]+")
+
+_WHY_STOPPED_QUERY = (
+    "SELECT why_stopped FROM ctgov.studies "
+    "WHERE why_stopped IS NOT NULL LIMIT 2000;"
+)
+
+
+@lru_cache(maxsize=None)
+def _english_stopwords():
+    """Return NLTK's English stopword list as a frozenset, built only once."""
+    return frozenset(stopwords.words("english"))
+
+
+def preporcess_text(text):
+    """Normalise one free-text entry before vectorisation.
+
+    The text is lower-cased, every run of non-letter characters is replaced
+    by a space, the result is tokenised with NLTK and English stopwords are
+    dropped.
+
+    Args:
+        text: The raw string to clean.
+
+    Returns:
+        The remaining tokens joined by single spaces (possibly empty).
+    """
+    letters_only = _NON_LETTERS.sub(" ", text.lower())
+    tokens = nltk.word_tokenize(letters_only)
+    # Use the cached set: calling stopwords.words() inside the filter would
+    # rebuild the word list and scan it linearly for every single token.
+    stop = _english_stopwords()
+    return " ".join(w for w in tokens if w not in stop).strip()
+
+
+def _fetch_why_stopped():
+    """Run the ``why_stopped`` query and return every fetched row."""
+    # NOTE(review): credentials are hard-coded; consider moving them to the
+    # environment or a config file.
+    conn = psyco.connect(
+        dbname="aact_db", user="analysis", host="localhost", password="test"
+    )
+    try:
+        with conn.cursor() as curse:
+            curse.execute(_WHY_STOPPED_QUERY)
+            return curse.fetchall()
+    finally:
+        # Close even when the query fails so the connection is not leaked.
+        conn.close()
+
+
+def main():
+    """Clean the fetched texts, cluster them and print per-cluster row counts."""
+    data = pd.DataFrame(_fetch_why_stopped(), columns=["corpus"])
+    data["cleaned"] = data.corpus.apply(preporcess_text)
+
+    vectorizer = TfidfVectorizer(sublinear_tf=True)
+    X = vectorizer.fit_transform(data.cleaned)
+
+    # Fixed seed keeps the clustering reproducible between runs.
+    kmeans = KMeans(n_clusters=3, random_state=11021585)
+    kmeans.fit(X)
+    data["cluster"] = kmeans.labels_
+
+    print(data.groupby(["cluster"]).count())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/DockerContainers/downloader/db_connection.py b/DockerContainers/downloader/db_connection.py index e5df6fc..a1da827 100644 --- a/DockerContainers/downloader/db_connection.py +++ b/DockerContainers/downloader/db_connection.py @@ -1,10 +1,10 @@ import psycopg2 as psyco -conn = psyco.connect(dbname="aact_db", user="admin", host="localhost", password="root") +conn = psyco.connect(dbname="aact_db", user="root", host="localhost", password="root") curse = conn.cursor() -curse.execute("SELECT * FROM testing") 
+curse.execute("SELECT * FROM ctgov.studies LIMIT 2;") print(curse.fetchall()) curse.close()