-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
118 lines (110 loc) · 3.67 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import json
import datetime
import time
from helpers import date_handler, default_json_response
from flask import Response
def getColor(string):
    """Map a news category label to its display color.

    Parameters
    ----------
    string : str
        Category label, matched case-insensitively (e.g. 'Student',
        'TV UMCS').

    Returns
    -------
    str
        Hex color code for the category; the neutral grey '#9399a5'
        for unknown categories or non-string input.
    """
    colors = {
        'student': '#4bbe9d',
        'lublin': '#dc462d',
        'tv umcs': '#f38200',
        'biznes': '#6b08ff',
        'pracownik': '#337b93',
        'absolwent': '#60baf6',
        'kandydat': '#ff6600',
    }
    # The original used a bare `except:` to fall back to grey for both
    # unknown keys and non-string input; handle each case explicitly.
    if not isinstance(string, str):
        return '#9399a5'
    return colors.get(string.lower(), '#9399a5')
# Size segments for the UMCS CDN image URLs: substituting one of these
# into a fixed position of an image path selects the served resolution.
# NOTE(review): the leading 'c'/'r' presumably mean crop/resize and the
# numbers a WxH pair -- TODO confirm against the CDN's URL scheme.
SMALL_IMAGE = 'c,270,164,t,c'
MEDIUM_IMAGE = 'r,480,360'
BIG_IMAGE = 'r,1024,800'
class Scraper:
    """Scrapes news boxes from the UMCS homepage and caches them as JSON.

    The scrape runs once at construction time; call :meth:`response` to
    get a Flask ``Response`` carrying the cached payload (HTTP 200 when
    the scrape succeeded, HTTP 400 with the default payload otherwise).
    """

    def __init__(self, url='https://www.umcs.pl/', timeout=500):
        self.url = url
        # Bug fix: `timeout` was stored but never passed to requests.get,
        # so a hung server blocked forever. NOTE(review): requests treats
        # this value as seconds; 500 looks like it was meant to be
        # milliseconds -- confirm with callers before shrinking it.
        self.timeout = timeout
        self.reload_time = 5   # seconds to wait between retries (sleep disabled below)
        self.retries = 3
        self.loaded = False    # True once a scrape has succeeded
        self.last_updated = ''
        self.jsonData = json.dumps(default_json_response)
        # Scrape immediately so the first response() call has data.
        self.start()

    def response(self):
        """Return the cached scrape result as a Flask JSON ``Response``.

        Returns
        -------
        flask.Response
            Status 200 with the scraped payload when loaded, otherwise
            status 400 with the default payload.
        """
        status = 200 if self.loaded else 400
        return Response(
            response=self.jsonData,
            status=status,
            mimetype='application/json'
        )

    def _soup(self):
        """Fetch the page and extract all news anchor tags, with retries.

        Returns
        -------
        tuple[bool, list]
            ``(True, anchors)`` on success; ``(False, [])`` after
            exhausting ``self.retries`` attempts.
        """
        attempt = 0  # renamed from `iter`, which shadowed the builtin
        success_load = False
        all_news = []
        while not success_load and attempt <= self.retries:
            try:
                req = requests.get(self.url, timeout=self.timeout)
                soup = BeautifulSoup(req.text, 'html.parser')
                all_news = soup.find_all('a', class_='box-news')
                success_load = True
                self.loaded = True
            except requests.RequestException:
                # Narrowed from a bare `except:`: only network/HTTP
                # failures are retryable here.
                print('Error, retrying {}/{}, waiting {} seconds.'.format(
                    attempt, self.retries, self.reload_time))
                # TODO: make this async -- sleeping here blocks the first
                # page load, which is why the sleep stays disabled.
                # time.sleep(self.reload_time)
                attempt += 1
        return success_load, all_news

    def start(self):
        """Run the scrape and cache the serialized result in ``jsonData``.

        Leaves the previous ``jsonData`` untouched when the fetch failed
        or produced no items.
        """
        success_response, news = self._soup()
        payload_data = self._getItems(news)
        if success_response and payload_data:
            data = {
                'success': success_response,
                'payload': payload_data,
                'last_updated': datetime.datetime.now(),
            }
            # date_handler serializes the datetime for json.dumps.
            self.jsonData = json.dumps(data, default=date_handler)

    def _getItems(self, news):
        """Extract title, type, color and hi-res image URL per news anchor.

        Parameters
        ----------
        news : list
            BeautifulSoup ``<a class="box-news">`` tags.

        Returns
        -------
        list[dict]
            One dict per successfully parsed item. A malformed item is
            skipped instead of discarding the whole batch (the original
            bare `except:` returned [] on the first bad item).
        """
        itemList = []
        for item in news:
            try:
                news_title = item.find('h4', {'class': 'title'}) \
                    .text.replace('\n', '').strip()
                news_type = item.find('em', {'class': 'label-area-A'}) \
                    .text.replace('\n', '').strip()
                news_image = item.find('img', {'class': 'img'})['src']
            except (AttributeError, KeyError, TypeError):
                # A missing title/type/image tag -- skip just this item.
                continue
            # Swap the size segment of the CDN URL (characters 25-38)
            # for the high-resolution variant. NOTE(review): relies on a
            # fixed URL layout -- confirm it still holds.
            news_hires = news_image[0:25] + BIG_IMAGE + news_image[38:]
            itemList.append({
                'url': news_hires,
                'color': getColor(news_type),
                'title': news_title,
                'type': news_type,
            })
        return itemList
if __name__ == '__main__':
    # Running the module directly performs a one-off scrape.
    scraper = Scraper()