# Source file: Bibmath2Tex/grab.py
# File-viewer metadata: 2025-04-13 21:06:51 +02:00 - 81 lines - 3.3 KiB - Python

from bs4 import BeautifulSoup
import requests
import urllib.parse as urlparse
# HTTP request headers sent along with the page request below. The values
# are those of a desktop Chromium-family browser (see 'sec-ch-ua' and
# 'user-agent'); key order is kept exactly as originally written.
headers = {
    # Content negotiation.
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8'
    ),
    'accept-language': 'fr-FR,fr;q=0.7',
    # Caching directives.
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'priority': 'u=0, i',
    # Client hints.
    'sec-ch-ua': '"Brave";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    # Fetch metadata.
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'sec-gpc': '1',
    'upgrade-insecure-requests': '1',
    # Browser identification string.
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36'
    ),
}
# Query-string parameters for the page request below. 'quoi' carries the
# path-like identifier of the requested exercise sheet (here the "matrices"
# sheet); the exact meaning of 'action' and 'type' is defined by the remote
# site and is not visible from this file.
params = dict(
    action='affiche',
    quoi='mpsi/feuillesexo/matrices',
    type='fexo',
)
PAGE_URL = 'https://bibmath.net/ressources/index.php'
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests may block forever
TITLE_SEPARATOR = ' - '
REPORT_LINK_TEXT = '[Signaler une erreur]'


def exercise_title_from_header(header_text: str) -> str:
    """Return the exercise title contained in a 'titreexo' header text.

    The title is the text after the first ``' - '`` and before the first
    ``'[Signaler une erreur]'`` marker, stripped of surrounding whitespace.
    If the separator is absent, the text before the marker is returned
    unchanged; if the marker is absent, everything after the separator is
    kept. (Unchecked ``str.find`` offsets would instead drop leading or
    trailing characters in those cases.)
    """
    _, separator, after_separator = header_text.partition(TITLE_SEPARATOR)
    candidate = after_separator if separator else header_text
    title, _, _ = candidate.partition(REPORT_LINK_TEXT)
    return title.strip()


def exercise_id_from_link(href: str):
    """Return the first ``numero`` query value of *href*, or None if absent."""
    query = urlparse.urlparse(href).query
    values = urlparse.parse_qs(query).get('numero')
    return values[0] if values else None


def _inner_text(section, default: str) -> str:
    """Return the text of the ``div.inner`` child of *section*, else *default*."""
    if section is None:
        return default
    inner = section.find('div', class_='inner')
    if inner is None:
        return default
    return inner.get_text(strip=True)


def parse_exercise(exo):
    """Extract the fields of one ``div.exo`` element.

    Returns a dict with keys ``number``, ``title``, ``id``, ``stars``,
    ``enonce``, ``indication`` and ``answer``, or None when the element has
    no ``div.titreexo`` header to read from. Missing optional pieces fall
    back to placeholder values instead of raising.
    """
    header = exo.find('div', class_='titreexo')
    if header is None:
        return None

    # The exercise number is read from the second <span> of the header.
    spans = header.find_all('span')
    number = spans[1].text.strip() if len(spans) > 1 else ""

    report_link = header.find(
        'a', href=lambda href: href and 'signalerreur.php' in href
    )
    exercise_id = (
        exercise_id_from_link(report_link['href']) if report_link else None
    )

    return {
        'number': number,
        'title': exercise_title_from_header(header.text),
        'id': exercise_id,
        # Counts every <img> in the header; presumably one image per
        # difficulty star -- confirm against the page markup.
        'stars': len(header.find_all('img')),
        'enonce': _inner_text(exo.find('div', class_='enonce'), "No statement"),
        'indication': _inner_text(
            exo.find('div', class_='indication'), "No indication"
        ),
        'answer': _inner_text(exo.find('div', class_='corrige'), "No answer"),
    }


def main():
    """Fetch the configured page and parse its exercises.

    Prints the page title and every part title, exactly as the flat script
    did, and prints a diagnostic when the request fails or the expected
    ``article#contenugauche`` element is missing.

    Returns:
        The list of exercise dicts produced by ``parse_exercise`` (empty on
        any failure). The exercise fields themselves are not printed.
    """
    try:
        response = requests.get(
            PAGE_URL,
            headers=headers,
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as error:
        print(f"Request failed: {error}")
        return []

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('article', id='contenugauche')
    if article is None:
        print("Article with id 'contenugauche' not found.")
        return []

    heading = article.find('h1')
    title = heading.get_text(strip=True) if heading is not None else "No title"
    print(f"Title: {title}")

    # Find all part titles
    for part in article.find_all('div', class_='titrepartie'):
        print(f"Part of the File: {part.get_text(strip=True)}")

    parsed = (parse_exercise(exo) for exo in article.find_all('div', class_='exo'))
    # Exercises without a readable header are skipped rather than aborting.
    return [exercise for exercise in parsed if exercise is not None]


if __name__ == "__main__":
    main()