nyt_api.py
import requests
import nltk
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud
# Sections accepted by the Top Stories API.
options = ['arts', 'automobiles', 'books', 'business',
           'fashion', 'food', 'health', 'home',
           'insider', 'magazine', 'movies', 'nyregion',
           'obituaries', 'opinion', 'politics', 'realestate',
           'science', 'sports', 'sundayreview', 'technology',
           'theater', 't-magazine', 'travel', 'upshot',
           'us', 'world']
nyt_top_base = "https://api.nytimes.com/svc/topstories/v2/"
nyt_popular_base = "https://api.nytimes.com/svc/mostpopular/v2/"
def gen_top_url(subj, key, base=nyt_top_base):
    """Build a Top Stories URL for one of the sections in `options`."""
    return f"{base}{subj}.json?api-key={key}"

def gen_popular_url(share_method, timeframe, key, base=nyt_popular_base):
    """Build a Most Popular URL; `share_method` is 'viewed', 'shared', or
    'emailed', and `timeframe` is 1, 7, or 30 (days)."""
    return f"{base}{share_method}/{timeframe}.json?api-key={key}"
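
# Example (illustrative): the URLs these helpers produce. "YOUR_API_KEY"
# is a placeholder; real keys come from https://developer.nytimes.com.
#   gen_top_url("science", "YOUR_API_KEY")
#   -> "https://api.nytimes.com/svc/topstories/v2/science.json?api-key=YOUR_API_KEY"
#   gen_popular_url("viewed", 7, "YOUR_API_KEY")
#   -> "https://api.nytimes.com/svc/mostpopular/v2/viewed/7.json?api-key=YOUR_API_KEY"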
nltk.download("punkt")
nltk.download("stopwords")
# now tokenize and all that
def compile_dict_list_fields(dlist, field):
    """Pull one field out of every dict in a list (e.g. every article's abstract)."""
    return [entry[field] for entry in dlist]

def clean_tokens(text, stop_words):
    """Tokenize `text`, lowercase it, and drop punctuation, numbers, and stopwords.

    The parameter is named `stop_words` so it does not shadow the imported
    `stopwords` corpus module."""
    stop_words = set(stop_words)  # set membership checks are O(1)
    return [word.lower() for word in word_tokenize(text)
            if word.isalpha() and word.lower() not in stop_words]
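
# Quick sanity check for clean_tokens (the sample sentence is made up):
#   clean_tokens("The Cats are sleeping!", stopwords.words("english"))
#   -> ['cats', 'sleeping']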
def tokenize_topic_abstracts(url, stop_words=None):
    """Fetch the articles behind `url` and return their abstracts as one
    cleaned token list."""
    if stop_words is None:
        stop_words = stopwords.words("english")
    response = requests.get(url)
    response.raise_for_status()  # fail loudly on a bad key or endpoint
    article_list = response.json()["results"]
    full_text = ' '.join(compile_dict_list_fields(article_list, field="abstract"))
    return clean_tokens(full_text, stop_words)
def wc_of_abstracts(url, stop_words=None):
    """Generate a word cloud from the abstracts behind `url`."""
    cleaned_list = tokenize_topic_abstracts(url, stop_words)
    return WordCloud().generate(' '.join(cleaned_list))
def freqdist_of_abstracts(url, stop_words=None):
    """Build a frequency distribution over the same cleaned tokens."""
    return FreqDist(tokenize_topic_abstracts(url, stop_words))
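
# Usage sketch: pull a section's top stories and look at its vocabulary.
# Requires a network connection and a real API key; "YOUR_API_KEY" is a
# placeholder.
if __name__ == "__main__":
    url = gen_top_url("science", "YOUR_API_KEY")

    # The ten most frequent words across the section's abstracts.
    print(freqdist_of_abstracts(url).most_common(10))

    # Render the word cloud with matplotlib (imported above as plt).
    plt.imshow(wc_of_abstracts(url), interpolation="bilinear")
    plt.axis("off")
    plt.show()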