Manuelcontent/LLS_scrap.ipynb
2023-05-02 16:49:04 +02:00

120 lines
3.4 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyO+gk/tdKqfk6oA5OoSzNwI",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/NohamR/Manuelcontent/blob/main/LLS_scrap.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "j3GU2ZHVpbja"
},
"outputs": [],
"source": [
"import urllib.request\n",
"import csv\n",
"import requests\n",
"import os\n",
"import threading\n",
"\n",
"def download_images(code):\n",
"    \"\"\"Download the print pages of one book into gen/<book id>/.\n",
"\n",
"    `code` is one row read from liste.csv; the value at index 5 is used as the\n",
"    book identifier in the asset URL and in the output directory name. The\n",
"    output directory must already exist.\n",
"\n",
"    Pages are requested in order starting at 1. The loop stops at the first\n",
"    response whose status is not 200, or after page 999.\n",
"    \"\"\"\n",
"    book_id = str(code[5])\n",
"    for page in range(1, 1000):\n",
"        url = \"https://assets.lls.fr/books/\" + book_id + \"/print/\" + str(page) + \".jpg\"\n",
"        print(url)\n",
"        # Without a timeout a single stalled connection would block this thread forever.\n",
"        r = requests.get(url, allow_redirects=True, timeout=30)\n",
"        if r.status_code != 200:\n",
"            break\n",
"        path = 'gen/' + book_id + '/page' + str(page) + '.jpg'\n",
"        # Save the body that was just fetched instead of downloading the URL a second time.\n",
"        with open(path, 'wb') as image_file:\n",
"            image_file.write(r.content)\n",
"\n",
"# Load the book list. csv.reader yields [] for blank lines, which would fail on row[5].\n",
"with open('liste.csv', newline='', encoding=\"utf8\") as csvfile:\n",
"    code = [row for row in csv.reader(csvfile, delimiter=',') if row]\n",
"\n",
"# Start one download thread per CSV row.\n",
"threads = []\n",
"for row in code:\n",
"    # makedirs(exist_ok=True) also creates gen/ itself and tolerates re-running this cell.\n",
"    os.makedirs('gen/' + str(row[5]), exist_ok=True)\n",
"    t = threading.Thread(target=download_images, args=(row,))\n",
"    threads.append(t)\n",
"    t.start()\n",
"\n",
"# Wait until every download thread has finished.\n",
"for t in threads:\n",
"    t.join()\n",
"\n",
"print(\"Toutes les images ont été téléchargées avec succès\") \n"
]
},
{
"cell_type": "code",
"source": [
"# Mount Google Drive at /content/drive.\n",
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "glMJLK-btM1X",
"outputId": "4d31ec89-f7b7-4b21-ff88-5ab30a9b4707"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Build the archive first so the copy in the next cell has a file to read.\n",
"!zip -r /content/gen.zip /content/gen/\n"
],
"metadata": {
"id": "LP8SCsPr2pLm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Keep a second copy of the archive.\n",
"!cp -r /content/gen.zip /content/gen2.zip\n"
],
"metadata": {
"id": "mb_7KJLIxPo-"
},
"execution_count": null,
"outputs": []
}
]
}