aliexpress-one-price-parser.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. #!/usr/bin/python3
import os
import sys
import time
import warnings
from collections import defaultdict

# Kept ahead of the third-party imports, as in the original, so the filter is
# installed before `telegram` is imported.
warnings.filterwarnings("ignore", message="python-telegram-bot is using upstream urllib3")

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from telegram import Bot, InputMediaPhoto
  12. # Telegram Bot Token and Chat ID
  13. TELEGRAM_BOT_TOKEN = ""
  14. TELEGRAM_CHAT_ID = ""
  15. # Function to ensure URLs start with "https://"
  16. def format_url(url):
  17. if url.startswith("//"):
  18. return f"https:{url}"
  19. elif not url.startswith(("http://", "https://")):
  20. return f"https://{url}"
  21. return url
  22. # Function to scroll the page a fixed number of times
  23. def scroll_page(driver, scroll_pause_time=5): # Increased wait time
  24. for _ in range(50):
  25. driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  26. time.sleep(scroll_pause_time)
  27. # Function to save fully rendered HTML to a file for debugging
  28. def save_rendered_html(driver, filename="debug.html"):
  29. rendered_html = driver.execute_script("return document.documentElement.outerHTML;")
  30. with open(filename, "w", encoding="utf-8") as file:
  31. file.write(rendered_html)
  32. print(f"Fully rendered HTML saved to {filename}")
  33. # Function to search for the word in the `alt` attribute and group `data-src` by `alt` text
  34. def search_word_in_alt_tags(driver, word):
  35. rendered_html = driver.execute_script("return document.documentElement.outerHTML;")
  36. save_rendered_html(driver)
  37. soup = BeautifulSoup(rendered_html, 'html.parser')
  38. alt_to_data_src = defaultdict(list)
  39. img_tags = soup.find_all('img', alt=True)
  40. for img in img_tags:
  41. alt_text = img.get('alt', '').lower()
  42. if word.lower() in alt_text:
  43. data_src = img.get('data-src')
  44. if data_src:
  45. data_src = format_url(data_src)
  46. alt_to_data_src[alt_text].append(data_src)
  47. return alt_to_data_src
  48. # Function to send media group to Telegram (limit to 10 photos)
  49. def send_to_telegram(alt_text, data_src_list):
  50. bot = Bot(token=TELEGRAM_BOT_TOKEN)
  51. data_src_list = data_src_list[:10]
  52. media_group = []
  53. for index, data_src in enumerate(data_src_list):
  54. media = InputMediaPhoto(media=data_src, caption=alt_text if index == 0 else "")
  55. media_group.append(media)
  56. bot.send_media_group(chat_id=TELEGRAM_CHAT_ID, media=media_group)
  57. print(f"Sent {len(media_group)} image(s) with alt text: '{alt_text}'")
  58. # Main function
  59. def main(url, word):
  60. # Set up Chrome options for headless mode
  61. chrome_options = Options()
  62. chrome_options.add_argument("--headless")
  63. chrome_options.add_argument("--disable-gpu")
  64. chrome_options.add_argument("--no-sandbox")
  65. chrome_options.add_argument("--disable-dev-shm-usage")
  66. chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
  67. chrome_options.add_argument("--disable-blink-features=AutomationControlled")
  68. chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  69. chrome_options.add_experimental_option("useAutomationExtension", False)
  70. # Set up the WebDriver with headless options
  71. driver = webdriver.Chrome(options=chrome_options)
  72. try:
  73. driver.get(url)
  74. scroll_page(driver)
  75. alt_to_data_src = search_word_in_alt_tags(driver, word)
  76. if alt_to_data_src:
  77. print(f"Found {len(alt_to_data_src)} unique alt text(s) containing the word '{word}'.")
  78. for alt_text, data_src_list in alt_to_data_src.items():
  79. send_to_telegram(alt_text, data_src_list)
  80. else:
  81. print(f"The word '{word}' was not found in any alt attributes after scrolling 50 pages.")
  82. finally:
  83. driver.quit()
  84. if __name__ == "__main__":
  85. if len(sys.argv) != 3:
  86. print("Usage: python script.py <url> <word_to_search>")
  87. sys.exit(1)
  88. url = sys.argv[1]
  89. word_to_search = sys.argv[2]
  90. if not url.startswith(("http://", "https://")):
  91. url = "https://" + url
  92. main(url, word_to_search)