generate.py
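# generate.py: build a word-frequency table for each issue tag from
# messages stored in MySQL, then write one CSV per issue with shared
# high-frequency words filtered out. Written for Python 2
# (string.maketrans, FreqDist.items() slicing, print statement).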
import MySQLdb
import MySQLdb.cursors
import nltk
import string
import csv
import threading
import pickle
import config
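# Issue tags; each one is substituted into the placeholder of the SQL
# template loaded from query_messages.txt below.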
issues = [
    "depressed",
    "suicide",
    "self_harm",
    "family",
    "stress",
    "relationship",
    "isolated",
    "anxiety",
    "friend",
    "school",
    "abuse",
    "substance",
    "bereavement",
    "bully",
    "medical",
    "sexual_abuse",
    "lgbtq",
    "eating",
    "3rd_party",
    "other"
]
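# SQL template for fetching messages; formatted with an issue tag below
# via query.format(issue).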
query = ''
with open('query_messages.txt', 'r') as file:
    query = file.read()
def freqForIssue(issue, results):
    # Each worker thread opens its own connection; SSCursor streams rows
    # from the server instead of buffering the whole result set.
    db = MySQLdb.connect(
        host=config.db_host,
        user=config.db_user,
        passwd=config.db_pass,
        db=config.db_name,
        cursorclass=MySQLdb.cursors.SSCursor
    )
    cursor = db.cursor()
    sql = query.format(issue)
    cursor.execute(sql)
    # Concatenate all messages for this issue, stripped of punctuation
    # and lower-cased (Python 2 str.translate signature).
    text = ''
    for value in cursor:
        value = value[0].translate(string.maketrans("", ""), string.punctuation).lower()
        text += ' ' + value
    text = text.split()
    freq = nltk.FreqDist(text)
    #bigrams = nltk.bigrams(text)
    #bigram_freq = nltk.FreqDist(bigrams)
    #freq = freq.items() + [(' '.join(i[0]), i[1]) for i in bigram_freq.items()]
    #freq = sorted(freq, reverse=True, key=lambda word_freq: word_freq[1])
    # FreqDist.items() is ordered by decreasing count; keep the top 5000.
    freq = freq.items()[:5000]
    results[issue] = freq
def getPickleFrequencies():
    # Load frequencies cached by a previous doAnalysis() run; open in
    # binary mode to match the 'wb' used by pickle.dump below.
    return pickle.load(open('save.p', 'rb'))
def doAnalysis(frequencies):
    # One worker thread per issue; each stores its result in the shared dict.
    threads = []
    for issue in issues:
        t = threading.Thread(target=freqForIssue, args=(issue, frequencies))
        threads.append(t)
    [t.start() for t in threads]
    [t.join() for t in threads]
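# To rebuild the cache, uncomment the three lines below and comment out
# the getPickleFrequencies() call.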
#frequencies = {}
#doAnalysis(frequencies)
#pickle.dump(frequencies, open('save.p', 'wb'))
frequencies = getPickleFrequencies()

# Words ranking in the top 100 for every issue carry no signal for
# distinguishing issues, so treat them as additional stopwords.
sets = []
for issue in issues:
    sets.append(set([freq[0] for freq in frequencies[issue][:100]]))
intersection = set.intersection(*sets)
stopwords = nltk.corpus.stopwords.words('english') + list(intersection)

# Write one CSV of (word, count) rows per issue, omitting stopwords.
for issue in frequencies:
    writer = csv.writer(open('output/' + issue + '.csv', 'wb'), quoting=csv.QUOTE_MINIMAL)
    writer.writerows([frequency for frequency in frequencies[issue] if frequency[0] not in stopwords])
print intersection