mirror of
https://github.com/NohamR/Bibmath2Tex.git
synced 2026-05-24 19:58:43 +00:00
first crawl
This commit is contained in:
5
Makefile
5
Makefile
@@ -1,11 +1,10 @@
|
||||
# Build the PDF for the generated LaTeX file into $(OUTDIR).
TEX=pdflatex
SRC=file.tex
OUTDIR=output
# Derived from SRC so the PDF name cannot drift when SRC is renamed.
OUTPDF=$(OUTDIR)/$(SRC:.tex=.pdf)

# Neither target produces a file with its own name.
.PHONY: all clean

all:
	@mkdir -p $(OUTDIR)
	$(TEX) -interaction=batchmode -output-directory=$(OUTDIR) $(SRC)

clean:
	rm -rf $(OUTDIR)/*.aux $(OUTDIR)/*.log $(OUTDIR)/*.pdf
|
||||
|
||||
266
grab.py
266
grab.py
@@ -1,81 +1,213 @@
|
||||
from pprint import pprint
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import urllib.parse as urlparse
|
||||
|
||||
headers = {
|
||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||
'accept-language': 'fr-FR,fr;q=0.7',
|
||||
'cache-control': 'no-cache',
|
||||
'pragma': 'no-cache',
|
||||
'priority': 'u=0, i',
|
||||
'sec-ch-ua': '"Brave";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
|
||||
'sec-ch-ua-mobile': '?0',
|
||||
'sec-ch-ua-platform': '"macOS"',
|
||||
'sec-fetch-dest': 'document',
|
||||
'sec-fetch-mode': 'navigate',
|
||||
'sec-fetch-site': 'same-origin',
|
||||
'sec-fetch-user': '?1',
|
||||
'sec-gpc': '1',
|
||||
'upgrade-insecure-requests': '1',
|
||||
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
||||
}
|
||||
class Exercise:
    """One exercise with its statement, hint and solution.

    The constructor stores every argument unchanged on the instance under
    the same name.

    Args:
        number: Exercise number as displayed on the sheet.
        title: Exercise title.
        id: Exercise identifier. The name shadows the builtin but is kept
            because callers pass it by keyword.
        stars: Star count attached to the exercise.
        enonce: Text of the statement.
        indication: Text of the hint.
        answer: Text of the solution.
    """

    def __init__(self, number, title, id, stars, enonce, indication, answer):
        self.number = number
        self.title = title
        self.id = id
        self.stars = stars
        self.enonce = enonce
        self.indication = indication
        self.answer = answer

    def __repr__(self):
        """Return a short summary; the long text fields are left out."""
        return f"Exercise({self.number}, {self.title}, {self.id}, {self.stars})"
|
||||
|
||||
class Part:
    """A titled group of exercises.

    Args:
        title: Title of the part.

    Attributes are ``title`` and ``exercises`` (a list that starts empty and
    is filled through :meth:`add_exercise`).
    """

    def __init__(self, title):
        self.title = title
        self.exercises = []

    def add_exercise(self, exercise):
        """Append *exercise* to the end of ``self.exercises``."""
        self.exercises.append(exercise)

    def __repr__(self):
        """Return the title and the number of exercises held."""
        return f"Part({self.title}, {len(self.exercises)} exercises)"
|
||||
|
||||
class Chapitre:
    """A chapter: a title, an optional source URL and an ordered list of parts.

    Args:
        title: Chapter title.
        url: Address the chapter was loaded from, or ``None`` (the default).
    """

    def __init__(self, title, url=None):
        self.title = title
        self.parts = []
        self.url = url

    def add_part(self, part):
        """Append *part* to the end of ``self.parts``."""
        self.parts.append(part)

    def __repr__(self):
        """Return the title and the number of parts held."""
        return f"Chapitre({self.title}, {len(self.parts)} parts)"

    def show(self):
        """Print the chapter, each part and every field of each exercise.

        Each part must expose ``title`` and ``exercises``; each exercise must
        expose ``number``, ``title``, ``id``, ``stars``, ``enonce``,
        ``indication`` and ``answer``.
        """
        print(f"Chapitre: {self.title}")
        for part in self.parts:
            print(f" Part: {part.title}")
            for exercise in part.exercises:
                print(f" Exercise: {exercise.number}, {exercise.title}, {exercise.id}, {exercise.stars} stars")
                print(f" Enonce: {exercise.enonce}")
                print(f" Indication: {exercise.indication}")
                print(f" Answer: {exercise.answer}")
|
||||
|
||||
class LatexFile:
    """Accumulate LaTeX source in memory and write it to ``filename``.

    Text is appended to ``self.content`` by the ``add_*`` methods and written
    out by :meth:`save`. :meth:`generate_latex` runs the whole sequence for
    one chapter.

    Args:
        filename: Path of the ``.tex`` file that :meth:`save` writes.
    """

    def __init__(self, filename):
        self.filename = filename
        self.content = ""

    def add_header(self):
        """Append the contents of ``parts/header.tex``."""
        # Explicit UTF-8: without it open() falls back to the locale
        # encoding, which breaks accented text on non-UTF-8 systems.
        with open('parts/header.tex', 'r', encoding='utf-8') as f:
            self.content += f.read()

    def add_footer(self):
        """Append the contents of ``parts/footer.tex``."""
        with open('parts/footer.tex', 'r', encoding='utf-8') as f:
            self.content += f.read()

    def add_content(self, content):
        """Append *content* followed by a newline."""
        self.content += content + "\n"

    def add_pagebreak(self):
        """Append a ``\\newpage`` command."""
        self.content += "\\newpage\n"

    def add_source(self, chapitre):
        """Append a ``\\href`` line linking ``chapitre.title`` to ``chapitre.url``."""
        self.content += f"\\noindent\\textbf{{Chapitre:}} \\href{{{chapitre.url}}}{{{chapitre.title}}}\n"

    def add_exercise(self, exercise):
        """Append the statement of *exercise* between its opening and closing macros.

        ``name`` and ``date`` in the ``\\exercice`` argument are literal
        placeholder words, not values taken from *exercise*.
        """
        self.content += f"\\exercice{{{exercise.number}, name, date, {exercise.stars}, {exercise.title}}}\n"
        self.content += f"\\enonce{{{exercise.number}}}{{}}\n"
        self.content += f"{exercise.enonce}\n"
        self.content += f"\\finenonce{{{exercise.number}}}\n"
        self.content += "\\finexercice\n"
        self.content += "\n"

    def add_indication(self, exercise):
        """Append the hint of *exercise* between ``\\indication`` and ``\\finindication``."""
        self.content += f"\\indication{{{exercise.number}}}\n"
        self.content += f"{exercise.indication}\n"
        self.content += "\\finindication\n"
        self.content += "\n"

    def add_answer(self, exercise):
        """Append the solution of *exercise* between ``\\correction`` and ``\\fincorrection``."""
        self.content += f"\\correction{{{exercise.number}}}\n"
        self.content += f"{exercise.answer}\n"
        self.content += "\\fincorrection\n"
        self.content += "\n"

    def sanitize(self):
        """Return ``self.content`` without characters that UTF-8 cannot encode.

        Encoding with ``errors='ignore'`` silently drops unencodable code
        points (lone surrogates); all other text is returned unchanged.
        """
        return self.content.encode('utf-8', 'ignore').decode('utf-8')

    def save(self):
        """Write the sanitized content to ``self.filename`` as UTF-8."""
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.write(self.sanitize())

    def generate_latex(self, chapitre):
        """Build the full document for *chapitre* and save it.

        Layout: header, source link, title and one section per part holding
        the statements; a page break; all hints; a page break; all
        solutions; footer.
        """
        self.add_header()

        self.add_source(chapitre)

        self.add_content(f"\\title{{{chapitre.title}}}")
        for part in chapitre.parts:
            self.add_content(f"\\section{{{part.title}}}")
            for ex in part.exercises:
                self.add_exercise(ex)

        self.add_pagebreak()

        for part in chapitre.parts:
            for ex in part.exercises:
                self.add_indication(ex)
        self.add_pagebreak()

        for part in chapitre.parts:
            for ex in part.exercises:
                self.add_answer(ex)

        self.add_footer()
        self.save()
|
||||
|
||||
def _inner_text(element, css_class, default):
    """Return the stripped text of ``div.<css_class> > div.inner``, or *default* if the outer div is absent."""
    block = element.find('div', class_=css_class)
    if block is None:
        return default
    return block.find('div', class_='inner').get_text(strip=True)


def _parse_exercise(element):
    """Build an ``Exercise`` from one ``div.exo`` element of the page."""
    exo_title_div = element.find('div', class_='titreexo')

    # The exercise id is only exposed as the ``numero`` query parameter of
    # the error-report link; fall back to None when that link is missing.
    error_anchor = exo_title_div.find('a', href=lambda href: href and 'signalerreur.php' in href)
    exercise_id = None
    if error_anchor is not None:
        query = urlparse.parse_qs(urlparse.urlparse(error_anchor['href']).query)
        exercise_id = query.get('numero', [None])[0]

    exercise_number = exo_title_div.find_all('span')[1].text.strip()

    # The title sits between the first ' - ' and the error-report link text.
    # partition/split avoid the -1 that str.find returns when either marker
    # is missing, which would otherwise slice at the wrong offsets.
    _, separator, after_separator = exo_title_div.text.partition(' - ')
    exercise_title = ""
    if separator:
        exercise_title = after_separator.split('[Signaler une erreur]', 1)[0].strip()

    # One <img> per star in the title bar.
    stars_count = len(exo_title_div.find_all('img'))

    enonce = element.find('div', class_='enonce').find('div', class_='inner').get_text(strip=True)
    indication = _inner_text(element, 'indication', "No indication")
    answer = _inner_text(element, 'corrige', "No answer")

    return Exercise(
        number=exercise_number,
        title=exercise_title,
        id=exercise_id,
        stars=stars_count,
        enonce=enonce,
        indication=indication,
        answer=answer,
    )


def fetch_chapitre(quoi, timeout=30):
    """Download one exercise sheet and parse it into a ``Chapitre``.

    Args:
        quoi: Value sent as the ``quoi`` query parameter, e.g.
            ``'mpsi/feuillesexo/matrices'``.
        timeout: Seconds passed to ``requests.get``; without a timeout the
            call can block indefinitely on an unresponsive server.

    Returns:
        The populated ``Chapitre``, or ``None`` (after printing a message)
        when the response status is not 200 or the page has no
        ``<article id="contenugauche">``.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # Browser-like headers sent with the request.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'accept-language': 'fr-FR,fr;q=0.7',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Brave";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'sec-gpc': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    }
    params = {
        'action': 'affiche',
        'quoi': quoi,
        'type': 'fexo',
    }
    response = requests.get(
        'https://bibmath.net/ressources/index.php',
        headers=headers,
        params=params,
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('article', id='contenugauche')
    if article is None:
        print("Article with id 'contenugauche' not found.")
        return None

    title = article.find('h1').get_text(strip=True)
    chapitre = Chapitre(title, url=response.url)
    current_part = None
    # Only direct children are walked: a 'titrepartie' element opens a new
    # part and each following 'exo' element is attached to it. Exercises
    # that appear before the first part title are skipped.
    for element in article.find_all(recursive=False):
        classes = element.get('class', [])
        if 'titrepartie' in classes:
            current_part = Part(element.get_text(strip=True))
            chapitre.add_part(current_part)
        elif 'exo' in classes and current_part is not None:
            current_part.add_exercise(_parse_exercise(element))
    return chapitre
|
||||
|
||||
# Script entry: fetch one exercise sheet and render it to file.tex.
quoi = 'mpsi/feuillesexo/matrices'
chapitre = fetch_chapitre(quoi)

# fetch_chapitre returns None (after printing the reason) when the download
# or the page lookup fails; stop here instead of crashing inside
# generate_latex with an AttributeError on None.
if chapitre is None:
    raise SystemExit(f"Could not fetch chapter '{quoi}'; no LaTeX file generated.")

latex_file = LatexFile('file.tex')
latex_file.generate_latex(chapitre)
print("LaTeX file generated: file.tex")
|
||||
@@ -106,5 +106,5 @@
|
||||
\usepackage{graphics}
|
||||
\usepackage[all]{xy}
|
||||
|
||||
|
||||
\begin{document}
|
||||
\pagestyle{empty}
|
||||
\begin{document}
|
||||
|
||||
Reference in New Issue
Block a user