parser/main/parser/parser.py
2023-09-11 16:49:08 +06:00

49 lines
1.4 KiB
Python

import requests
from bs4 import BeautifulSoup as BS
from django.views.decorators.csrf import csrf_exempt
URL = 'https://www.krsu.edu.kg/'
HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0'
}
@csrf_exempt
def get_html(url, params=''):
req = requests.get(url, headers=HEADERS)
return req
@csrf_exempt
def get_data(html):
soup = BS(html, 'html.parser')
items = soup.find_all('div', class_='news-column-block')
movies = []
for item in items:
movies.append(
{
'link': URL + item.find('a').get('href'),
'title': URL + item.find('a').get('href'),
'title_text': item.find('div', class_='views-field views-field-field-data-publikacii').find(
'div').get_text(),
'image': URL + item.find('div', class_='field-content news-img').find('img').get('src')
}
)
return movies
@csrf_exempt
def parser():
html = get_html(URL)
if html.status_code == 200:
movies1 = []
for page in range(0, 1):
html = get_html(f'https://www.krsu.edu.kg/', params=page)
movies1.extend(get_data(html.text))
return movies1
else:
raise Exception('Error in parser func........')