mirror of https://github.com/bmaltais/kohya_ss
40 lines
1.3 KiB
Python
40 lines
1.3 KiB
Python
import os
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
from html2text import html2text
|
|
|
|
# Specify the URL of the webpage you want to scrape
url = "https://hoshikat-hatenablog-com.translate.goog/entry/2023/05/26/223229?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en-US&_x_tr_pto=wapp"

# Directory the downloaded images are written to
IMAGE_DIR = "./logs"

# File the converted markdown is written to
MARKDOWN_FILE = "converted_markdown.md"

# Seconds to wait on a request before giving up, so a stalled server
# cannot hang the script forever
REQUEST_TIMEOUT = 30


def image_filename(image_url):
    """Return the file name component of ``image_url``.

    Only the path part of the URL is considered, so a query string or
    fragment (``pic.png?width=300``) never ends up in the file name.
    Returns an empty string when the path ends with ``/`` and therefore
    names no file.
    """
    # Local import: urlsplit is not among the file-level imports.
    from urllib.parse import urlsplit

    return os.path.basename(urlsplit(image_url).path)


def download_images(soup, page_url, dest_dir):
    """Save every image referenced by ``soup`` into ``dest_dir``.

    Args:
        soup: Parsed document; must provide ``find_all('img')`` returning
            tags that support ``.get('src')``.
        page_url: URL of the page, used to resolve relative image sources.
        dest_dir: Directory to write the images to; created if missing.

    Returns:
        The number of images written to disk.

    Images that cannot be downloaded are reported on stdout and skipped so
    that one broken link does not abort the whole run.
    """
    os.makedirs(dest_dir, exist_ok=True)

    saved = 0
    for image in soup.find_all('img'):
        # <img> tags without a src (e.g. lazy-loaded ones) have nothing to fetch
        src = image.get('src')
        if not src:
            continue

        # Resolve relative sources against the page URL
        image_url = urljoin(page_url, src)

        # Inline data: URIs and other non-HTTP schemes cannot be requested
        if not image_url.lower().startswith(("http://", "https://")):
            continue

        file_name = image_filename(image_url)
        if not file_name:
            # URL path ends with "/": there is no file name to save under
            continue

        try:
            image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            # Do not store an HTTP error page as if it were the image
            image_response.raise_for_status()
        except requests.RequestException as err:
            print(f"Skipping {image_url}: {err}")
            continue

        # Open the image file in write binary mode and store the payload
        with open(os.path.join(dest_dir, file_name), 'wb') as file:
            file.write(image_response.content)
        saved += 1

    return saved


def main():
    """Fetch the page, download its images and save it as markdown."""
    # Send the HTTP request and fail loudly on an error status instead of
    # converting an error page
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()

    # Create a BeautifulSoup object and specify the parser
    soup = BeautifulSoup(r.text, 'html.parser')

    # Download all images referenced by the page
    download_images(soup, url, IMAGE_DIR)

    # Convert the HTML content to markdown
    markdown_content = html2text(r.text)

    # Save the markdown content to a file
    with open(MARKDOWN_FILE, "w", encoding="utf8") as file:
        file.write(markdown_content)


if __name__ == "__main__":
    main()
|