mirror of
https://github.com/NohamR/Manuelcontent.git
synced 2025-05-23 16:49:24 +00:00
120 lines
3.4 KiB
Plaintext
120 lines
3.4 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"provenance": [],
|
|
"authorship_tag": "ABX9TyO+gk/tdKqfk6oA5OoSzNwI",
|
|
"include_colab_link": true
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "view-in-github",
|
|
"colab_type": "text"
|
|
},
|
|
"source": [
|
|
"<a href=\"https://colab.research.google.com/github/NohamR/Manuelcontent/blob/main/LLS_scrap.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "j3GU2ZHVpbja"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import urllib.request\n",
|
|
"import csv\n",
|
|
"import requests\n",
|
|
"import os\n",
|
|
"import threading\n",
|
|
"\n",
"def download_images(code, max_pages=999, timeout=30):\n",
"    \"\"\"Download every available print page of one book as JPEG files.\n",
"\n",
"    Pages are requested in increasing order from\n",
"    https://assets.lls.fr/books/<id>/print/<page>.jpg and written to\n",
"    gen/<id>/page<page>.jpg. The loop stops at the first response whose\n",
"    status is not 200, or at the first network error.\n",
"\n",
"    Args:\n",
"        code: One row read from liste.csv; index 5 is used as the book id.\n",
"        max_pages: Highest page number to try (the default keeps the\n",
"            original 1..999 range).\n",
"        timeout: Seconds to wait for each HTTP response.\n",
"    \"\"\"\n",
"    book_id = str(code[5])\n",
"    target_dir = 'gen/' + book_id\n",
"    # Also create the folder here so the function works when called on its own.\n",
"    os.makedirs(target_dir, exist_ok=True)\n",
"    for page in range(1, max_pages + 1):\n",
"        url = \"https://assets.lls.fr/books/\" + book_id + \"/print/\" + str(page) + \".jpg\"\n",
"        print(url)\n",
"        try:\n",
"            r = requests.get(url, allow_redirects=True, timeout=timeout)\n",
"        except requests.RequestException as exc:\n",
"            # Report the failure and stop this book instead of dying with a bare traceback.\n",
"            print(\"Download failed for \" + url + \": \" + str(exc))\n",
"            break\n",
"        if r.status_code != 200:\n",
"            break\n",
"        path = target_dir + '/page' + str(page) + '.jpg'\n",
"        # Save the body that was already fetched instead of downloading the page a second time.\n",
"        with open(path, 'wb') as image_file:\n",
"            image_file.write(r.content)\n",
|
|
"\n",
"# Read the book list; every kept row is later handed to download_images,\n",
"# which reads the book id at index 5.\n",
"code = []\n",
"with open('liste.csv', newline='', encoding=\"utf8\") as csvfile:\n",
"    for row in csv.reader(csvfile, delimiter=','):\n",
"        # Skip blank or short rows instead of raising IndexError further down.\n",
"        if len(row) > 5 and row[5].strip():\n",
"            code.append(row)\n",
"\n",
"# Start one download thread per book.\n",
"threads = []\n",
"for row in code:\n",
"    # makedirs also creates the parent 'gen' folder and does not fail when the cell is re-run.\n",
"    os.makedirs('gen/' + str(row[5]), exist_ok=True)\n",
"    t = threading.Thread(target=download_images, args=(row,))\n",
"    threads.append(t)\n",
"    t.start()\n",
"\n",
"# Wait for every download thread before reporting completion.\n",
"for t in threads:\n",
"    t.join()\n",
"\n",
"print(\"Toutes les images ont été téléchargées avec succès\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
"from google.colab import drive\n",
"\n",
"# Mount Google Drive at /content/drive.\n",
"MOUNT_POINT = '/content/drive'\n",
"drive.mount(MOUNT_POINT)"
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "glMJLK-btM1X",
|
|
"outputId": "4d31ec89-f7b7-4b21-ff88-5ab30a9b4707"
|
|
},
|
|
"execution_count": 12,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"source": [
"# Archive the downloaded pages; this must run before the copy below.\n",
"!zip -r /content/gen.zip /content/gen/\n"
],
"metadata": {
"id": "LP8SCsPr2pLm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Keep a second copy of the archive created by the previous cell.\n",
"!cp /content/gen.zip /content/gen2.zip\n"
],
"metadata": {
"id": "mb_7KJLIxPo-"
},
"execution_count": null,
"outputs": []
}
|
|
]
|
|
} |