"""Scrape exercise sheets from web pages and dump each one as a LaTeX file.

Every URL listed in ``pages.txt`` is downloaded, split into parts and
exercises, and written to ``dump/<chapter title>.tex`` using the header and
footer templates stored in ``parts/``.
"""
import os
import urllib.parse as urlparse
from pprint import pprint  # noqa: F401  (unused here; kept for other users of the module)
from typing import List, Optional

import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 30

# Text of the "report an error" link that follows the exercise title on the page.
ERROR_REPORT_MARKER = '[Signaler une erreur]'

# Browser-like headers sent with every request.
REQUEST_HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'accept-language': 'fr-FR,fr;q=0.7',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'priority': 'u=0, i',
    'sec-ch-ua': '"Brave";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'sec-gpc': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
}


class Exercise:
    """One exercise: its statement ("enonce"), hint ("indication") and answer."""

    def __init__(self, number, title, id, stars, enonce, indication, answer):
        # `id` shadows the builtin, but it is part of the keyword interface
        # used by callers, so the name is kept.
        self.number = number
        self.title = title
        self.id = id
        self.stars = stars
        self.enonce = enonce
        self.indication = indication
        self.answer = answer

    def __repr__(self):
        return f"Exercise({self.number}, {self.title}, {self.id}, {self.stars})"


class Part:
    """A titled section of a chapter holding an ordered list of exercises."""

    def __init__(self, title):
        self.title = title
        self.exercises: List[Exercise] = []

    def add_exercise(self, exercise):
        """Append *exercise* to this part."""
        self.exercises.append(exercise)

    def __repr__(self):
        return f"Part({self.title}, {len(self.exercises)} exercises)"


class Chapitre:
    """A chapter (one scraped page): a normalized title, a URL and its parts."""

    def __init__(self, title, url=None):
        """Store the chapter, normalizing *title*.

        If the title contains a colon, only the text after the first colon is
        kept. The result is lower-cased and its first character upper-cased.
        """
        if ':' in title:
            # maxsplit=1 keeps the whole remainder when the title itself
            # contains further colons instead of silently dropping it.
            title = title.split(':', 1)[1].strip()
        title = title.lower()
        title = title[0].upper() + title[1:] if title else ""
        self.title = title
        self.parts: List[Part] = []
        self.url = url

    def add_part(self, part):
        """Append *part* to this chapter."""
        self.parts.append(part)

    def __repr__(self):
        return f"Chapitre({self.title}, {len(self.parts)} parts)"

    def show(self):
        """Print the chapter, its parts and every exercise to stdout."""
        print(f"Chapitre: {self.title}")
        for part in self.parts:
            print(f" Part: {part.title}")
            for exercise in part.exercises:
                print(f" Exercise: {exercise.number}, {exercise.title}, {exercise.id}, {exercise.stars} stars")
                print(f" Enonce: {exercise.enonce}")
                print(f" Indication: {exercise.indication}")
                print(f" Answer: {exercise.answer}")


class LatexFile:
    """Accumulates LaTeX source in ``content`` and writes it to ``filename``."""

    def __init__(self, filename):
        self.filename = filename
        self.content = ""

    def add_header(self):
        """Append the contents of ``parts/header.tex``."""
        with open('parts/header.tex', 'r', encoding='utf-8') as f:
            self.content += f.read()

    def add_footer(self):
        """Append the contents of ``parts/footer.tex``."""
        with open('parts/footer.tex', 'r', encoding='utf-8') as f:
            self.content += f.read()

    def add_content(self, content):
        """Append *content* followed by a newline."""
        self.content += content + "\n"

    def add_pagebreak(self):
        """Append a ``\\newpage`` command."""
        self.content += "\\newpage\n"

    def add_source(self, chapitre):
        """Append a line linking the chapter title to its source URL."""
        self.content += f"\\noindent\\textbf{{Chapitre:}} \\href{{{chapitre.url}}}{{{chapitre.title}}}\n"

    def add_exercise(self, exercise):
        """Append the ``\\exercice`` / ``\\enonce`` block for *exercise*."""
        self.content += f"\\exercice{{{exercise.id}, name, date, {exercise.stars}, {exercise.title}}}\n"
        self.content += f"\\enonce{{{exercise.id}}}{{}}\n"
        self.content += f"{exercise.enonce}\n"
        self.content += f"\\finenonce{{{exercise.id}}}\n"
        self.content += "\\finexercice\n"
        self.content += "\n"

    def add_indication(self, exercise):
        """Append the ``\\indication`` block (hint) for *exercise*."""
        self.content += f"\\indication{{{exercise.id}}}\n"
        self.content += f"{exercise.indication}\n"
        self.content += "\\finindication\n"
        self.content += "\n"

    def add_answer(self, exercise):
        """Append the ``\\correction`` block (answer) for *exercise*."""
        self.content += f"\\correction{{{exercise.id}}}\n"
        self.content += f"{exercise.answer}\n"
        self.content += "\\fincorrection\n"
        self.content += "\n"

    def sanitize(self):
        """Return the text to be written; currently the content unchanged."""
        return self.content

    def save(self):
        """Write the sanitized content to ``filename`` as UTF-8.

        The target directory is created when it does not exist yet.
        """
        directory = os.path.dirname(self.filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # Explicit encoding: the scraped text contains accented characters and
        # the platform default is not UTF-8 everywhere.
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.write(self.sanitize())

    def generate_latex(self, chapitre):
        """Build the full document for *chapitre* and save it.

        Layout: header, source link, title, then all exercise statements
        grouped by section, a page break, all hints, a page break, all
        answers, and finally the footer.
        """
        self.add_header()
        self.add_source(chapitre)
        self.add_content(f"\\title{{{chapitre.title}}}")
        for part in chapitre.parts:
            self.add_content(f"\\section{{{part.title}}}")
            for ex in part.exercises:
                self.add_exercise(ex)
        self.add_pagebreak()
        for part in chapitre.parts:
            for ex in part.exercises:
                self.add_indication(ex)
        self.add_pagebreak()
        for part in chapitre.parts:
            for ex in part.exercises:
                self.add_answer(ex)
        self.add_footer()
        self.save()


def parse(content):
    """Convert an HTML fragment to plain text.

    ``span``, ``a`` and ``img`` tags are removed. The first
    ``<ol class="enumeratechiffre">`` list, if any, is replaced by numbered
    paragraphs ("1. ...", "2. ...") preceded by a ``\\par``. The stripped text
    of what remains is returned.
    """
    soup2 = BeautifulSoup(str(content), 'lxml')
    for tag in soup2.find_all(['span', 'a', 'img']):
        tag.extract()

    questions = []
    ol_list = soup2.find('ol', class_='enumeratechiffre')
    if ol_list:
        for i, li in enumerate(ol_list.find_all('li'), 1):
            if i == 1:
                # Add a line break before the first question
                questions.append("\\par")
            questions.append(f"{i}. {li.get_text().strip()}")
        list_text = "\n\n".join(questions)
        ol_list.replace_with(list_text)

    content = soup2.get_text().strip()
    return content


def _section_text(element, css_class):
    """Return the parsed text of ``div.<css_class> > div.inner``, or "" if absent."""
    section = element.find('div', class_=css_class)
    inner = section.find('div', class_='inner') if section is not None else None
    return parse(inner) if inner is not None else ""


def _parse_exercise(element):
    """Build an Exercise from one ``exo`` element, or return None if it has no title bar."""
    exo_title_div = element.find('div', class_='titreexo')
    if exo_title_div is None:
        print("Exercise without 'titreexo' block skipped.")
        return None

    # The exercise number is read from the second <span> of the title bar.
    spans = exo_title_div.find_all('span')
    exercise_number = spans[1].text.strip() if len(spans) > 1 else ""

    # The title sits between " - " and the error-report link text. partition
    # and split leave the text intact when a marker is missing, where
    # find()-based slicing would silently chop characters.
    full_text = exo_title_div.text
    _, separator, remainder = full_text.partition(' - ')
    if not separator:
        remainder = full_text
    exercise_title = remainder.split(ERROR_REPORT_MARKER, 1)[0].strip()

    # The exercise id is the "numero" query parameter of the error-report link.
    exercise_id = ""
    error_anchor = exo_title_div.find('a', href=lambda href: href and 'signalerreur.php' in href)
    if error_anchor is not None:
        query = urlparse.urlparse(error_anchor['href']).query
        exercise_id = urlparse.parse_qs(query).get('numero', [""])[0]

    # The star rating is the number of <img> tags in the title bar.
    stars_count = len(exo_title_div.find_all('img'))

    return Exercise(
        number=exercise_number,
        title=exercise_title,
        id=exercise_id,
        stars=stars_count,
        enonce=_section_text(element, 'enonce'),
        indication=_section_text(element, 'indication'),
        answer=_section_text(element, 'corrige'),
    )


def fetch_chapitre(page):
    """Download *page* and parse it into a Chapitre.

    Returns None (after printing a message) when the request fails, the
    status code is not 200, or the ``article#contenugauche`` element is
    missing. Exercises that appear before the first part title are ignored.
    """
    try:
        response = requests.get(page, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A single unreachable page must not abort a whole batch run.
        print(f"Request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'lxml')
    article = soup.find('article', id='contenugauche')
    if article is None:
        print("Article with id 'contenugauche' not found.")
        return None

    heading = article.find('h1')
    title = heading.get_text(strip=True) if heading is not None else ""
    chapitre = Chapitre(title, url=response.url)

    current_part = None
    # Only direct children are scanned: part titles and exercises are siblings.
    for element in article.find_all(recursive=False):
        classes = element.get('class', [])
        if 'titrepartie' in classes:
            current_part = Part(element.get_text(strip=True))
            chapitre.add_part(current_part)
        elif 'exo' in classes and current_part is not None:
            exercise = _parse_exercise(element)
            if exercise is not None:
                current_part.add_exercise(exercise)
    return chapitre


def _safe_filename(title):
    """Return *title* with path separators replaced so it is a single file name."""
    cleaned = title.replace('/', '-').replace('\\', '-').strip()
    return cleaned or 'untitled'


def get_page(page):
    """Fetch *page* and write its LaTeX dump to ``dump/<title>.tex``.

    Pages that cannot be fetched or parsed are reported and skipped.
    """
    chapitre = fetch_chapitre(page)
    if chapitre is None:
        print(f"Skipping page: {page}")
        return
    output_path = f'dump/{_safe_filename(chapitre.title)}.tex'
    latex_file = LatexFile(output_path)
    latex_file.generate_latex(chapitre)
    print(f"LaTeX file generated: {output_path}")


def main():
    """Process every non-empty line of ``pages.txt`` as a page URL."""
    # Example of a page URL:
    # https://bibmath.net/ressources/index.php?action=affiche&quoi=mpsi/feuillesexo/prehilbert&type=fexo
    with open('pages.txt', 'r', encoding='utf-8') as f:
        for line in f:
            page = line.strip()
            if not page:
                # Blank lines (e.g. a trailing newline) are not URLs.
                continue
            get_page(page)


if __name__ == "__main__":
    main()