Mikhail Sokolov 11 сар өмнө
parent
commit
69b3e05655
2 өөрчлөгдсөн 115 нэмэгдсэн , 0 устгасан
  1. 3 0
      README.md
  2. 112 0
      aliexpress-one-price-parser.py

+ 3 - 0
README.md

@@ -0,0 +1,3 @@
+python3 aliexpress-one-price-parser.py aliexpress.ru/one-price bluetooth
+
+python3 aliexpress-one-price-parser.py aliexpress.ru/one-price носки

+ 112 - 0
aliexpress-one-price-parser.py

@@ -0,0 +1,112 @@
+import warnings
+warnings.filterwarnings("ignore", message="python-telegram-bot is using upstream urllib3")
+
+import time
+import sys
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from bs4 import BeautifulSoup
+from telegram import Bot, InputMediaPhoto
+from collections import defaultdict
+
+# Telegram Bot Token and Chat ID, read by send_to_telegram.
+# Both are empty placeholders in this file; presumably they must be filled in
+# before any message can be delivered.
+# NOTE(review): consider loading these from the environment instead of
+# committing real credentials to the repository.
+TELEGRAM_BOT_TOKEN = ""
+TELEGRAM_CHAT_ID = ""
+
+# Function to ensure URLs start with "https://"
+def format_url(url):
+    """Return *url* with an explicit scheme, defaulting to HTTPS.
+
+    URLs that already start with "http://" or "https://" are returned
+    unchanged. Protocol-relative URLs ("//host/path") get "https:" prepended,
+    and anything else gets "https://" prepended.
+    """
+    if url.startswith(("http://", "https://")):
+        return url
+    prefix = "https:" if url.startswith("//") else "https://"
+    return prefix + url
+
+# Function to scroll the page a fixed number of times
+def scroll_page(driver, scroll_pause_time=5, scroll_count=50):
+    """Scroll to the bottom of the page repeatedly, pausing after each scroll.
+
+    Args:
+        driver: Object exposing ``execute_script`` (the Selenium WebDriver
+            created in ``main``).
+        scroll_pause_time: Seconds to sleep after every scroll.
+        scroll_count: How many times to scroll. Defaults to 50, the value
+            that used to be hard-coded, so existing callers are unaffected.
+    """
+    for _ in range(scroll_count):
+        # Jump to the current bottom of the document; the height is re-read on
+        # every pass, so content appended in the meantime is scrolled past too.
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(scroll_pause_time)
+
+# Function to save fully rendered HTML to a file for debugging
+def save_rendered_html(driver, filename="debug.html"):
+    """Write the browser's current DOM to *filename* as UTF-8 text.
+
+    The markup is read through ``driver.execute_script`` as the document
+    element's ``outerHTML``, written to the file (overwriting it), and a
+    confirmation line is printed.
+    """
+    page_markup = driver.execute_script("return document.documentElement.outerHTML;")
+    with open(filename, mode="w", encoding="utf-8") as out:
+        out.write(page_markup)
+    print(f"Fully rendered HTML saved to {filename}")
+
+# Function to search for the word in the `alt` attribute and group `data-src` by `alt` text
+def search_word_in_alt_tags(driver, word):
+    """Collect image URLs whose ``alt`` text contains *word*.
+
+    The rendered DOM is read from *driver*, a debug copy is written via
+    ``save_rendered_html``, and every ``<img>`` carrying an ``alt`` attribute
+    is inspected. Matching is case-insensitive.
+
+    Returns:
+        A ``defaultdict(list)`` mapping each lower-cased alt text to the list
+        of its ``data-src`` URLs, normalized with ``format_url``. Images
+        without a ``data-src`` value are skipped.
+    """
+    rendered_html = driver.execute_script("return document.documentElement.outerHTML;")
+    save_rendered_html(driver)
+
+    needle = word.lower()
+    matches = defaultdict(list)
+    for img in BeautifulSoup(rendered_html, 'html.parser').find_all('img', alt=True):
+        alt_text = img.get('alt', '').lower()
+        data_src = img.get('data-src')
+        # Skip images that do not mention the word or have no lazy-load URL.
+        if needle not in alt_text or not data_src:
+            continue
+        matches[alt_text].append(format_url(data_src))
+
+    return matches
+
+# Function to send media group to Telegram (limit to 10 photos)
+def send_to_telegram(alt_text, data_src_list):
+    """Send the images found for one alt text to the configured Telegram chat.
+
+    Only the first 10 URLs are sent. Two or more images go out as one media
+    group with *alt_text* as the caption of the first photo. A single image
+    is sent with ``send_photo`` instead, because Telegram's sendMediaGroup
+    only accepts 2-10 items and rejects a one-element album. An empty list
+    sends nothing.
+
+    Args:
+        alt_text: Caption text for the images.
+        data_src_list: List of image URLs.
+
+    NOTE(review): the calls below are made synchronously, which matches the
+    v13-style python-telegram-bot API; in v20+ these methods are coroutines.
+    Confirm the installed version.
+    """
+    photos = data_src_list[:10]
+    if not photos:
+        # Nothing to deliver; avoid an API call that can only fail.
+        return
+
+    # Telegram limits captions to 1024 characters.
+    caption = alt_text[:1024]
+
+    bot = Bot(token=TELEGRAM_BOT_TOKEN)
+    if len(photos) == 1:
+        bot.send_photo(chat_id=TELEGRAM_CHAT_ID, photo=photos[0], caption=caption)
+    else:
+        media_group = [
+            InputMediaPhoto(media=src, caption=caption if index == 0 else "")
+            for index, src in enumerate(photos)
+        ]
+        bot.send_media_group(chat_id=TELEGRAM_CHAT_ID, media=media_group)
+
+    print(f"Sent {len(photos)} image(s) with alt text: '{alt_text}'")
+
+# Main function
+def main(url, word):
+    """Load *url* in headless Chrome, find images matching *word*, send them.
+
+    The page is opened and scrolled with ``scroll_page``, images whose alt
+    text contains *word* are collected with ``search_word_in_alt_tags``, and
+    each alt-text group is passed to ``send_to_telegram``. The browser is
+    always closed, even when a step raises.
+    """
+    # Command-line switches for the headless Chrome session.
+    chrome_options = Options()
+    for flag in (
+        "--headless",
+        "--disable-gpu",
+        "--no-sandbox",
+        "--disable-dev-shm-usage",
+        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "--disable-blink-features=AutomationControlled",
+    ):
+        chrome_options.add_argument(flag)
+    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    chrome_options.add_experimental_option("useAutomationExtension", False)
+
+    browser = webdriver.Chrome(options=chrome_options)
+
+    try:
+        browser.get(url)
+        scroll_page(browser)
+
+        groups = search_word_in_alt_tags(browser, word)
+        if not groups:
+            print(f"The word '{word}' was not found in any alt attributes after scrolling 50 pages.")
+        else:
+            print(f"Found {len(groups)} unique alt text(s) containing the word '{word}'.")
+            for caption, image_urls in groups.items():
+                send_to_telegram(caption, image_urls)
+
+    finally:
+        browser.quit()
+
+if __name__ == "__main__":
+    # Exactly two positional arguments are required: the page URL and the
+    # word to look for in image alt texts.
+    if len(sys.argv) != 3:
+        print("Usage: python script.py <url> <word_to_search>")
+        sys.exit(1)
+
+    # Normalize the start URL with the same helper used for image URLs, so a
+    # protocol-relative "//host/path" argument becomes "https://host/path"
+    # instead of "https:////host/path".
+    url = format_url(sys.argv[1])
+    word_to_search = sys.argv[2]
+
+    main(url, word_to_search)