#!/usr/bin/python3
"""Find images on a rendered web page by alt text and post them to Telegram.

The page is opened in headless Chrome and scrolled a fixed number of times.
Every ``<img>`` whose ``alt`` attribute contains the search word
(case-insensitive) and that carries a ``data-src`` attribute is collected,
grouped by alt text, and each group is sent to a Telegram chat as one media
group.

Usage:
    python script.py <url> <word>
"""
import warnings

# Installed before ``telegram`` is imported; presumably the warning is raised
# at import time, so the filter has to exist first -- confirm before moving.
warnings.filterwarnings("ignore", message="python-telegram-bot is using upstream urllib3")

import time
import sys
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from telegram import Bot, InputMediaPhoto
from collections import defaultdict

# Telegram Bot Token and Chat ID (must be filled in before sending works).
TELEGRAM_BOT_TOKEN = ""
TELEGRAM_CHAT_ID = ""

# How many times the page is scrolled to the bottom by default.
DEFAULT_SCROLL_COUNT = 50

# Telegram Bot API limits: at most 10 items per media group and at most
# 1024 characters per media caption.
MAX_MEDIA_GROUP_SIZE = 10
MAX_CAPTION_LENGTH = 1024


def format_url(url):
    """Return *url* with an explicit scheme.

    Protocol-relative URLs (``//host/path``) get ``https:`` prepended, URLs
    without any ``http://`` / ``https://`` prefix get ``https://`` prepended,
    and everything else is returned unchanged.
    """
    if url.startswith("//"):
        return f"https:{url}"
    if not url.startswith(("http://", "https://")):
        return f"https://{url}"
    return url


def scroll_page(driver, scroll_pause_time=5, scroll_count=DEFAULT_SCROLL_COUNT):
    """Scroll the page to the bottom a fixed number of times.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        scroll_pause_time: Seconds to sleep after each scroll.
        scroll_count: Number of scroll/sleep iterations to perform.
    """
    for _ in range(scroll_count):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)


def save_rendered_html(driver, filename="debug.html"):
    """Write the current rendered DOM to *filename* (UTF-8) for debugging."""
    rendered_html = driver.execute_script("return document.documentElement.outerHTML;")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(rendered_html)
    print(f"Fully rendered HTML saved to {filename}")


def search_word_in_alt_tags(driver, word):
    """Collect ``data-src`` URLs of images whose alt text contains *word*.

    The match is case-insensitive. As a side effect the rendered HTML is
    dumped to ``debug.html`` via ``save_rendered_html``.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        word: Substring to look for in each image's ``alt`` attribute.

    Returns:
        A ``defaultdict(list)`` mapping the lower-cased alt text to the list
        of normalized ``data-src`` URLs found with that alt text. Images
        without a ``data-src`` attribute are skipped.
    """
    rendered_html = driver.execute_script("return document.documentElement.outerHTML;")
    save_rendered_html(driver)

    soup = BeautifulSoup(rendered_html, 'html.parser')
    alt_to_data_src = defaultdict(list)
    needle = word.lower()

    for img in soup.find_all('img', alt=True):
        alt_text = img.get('alt', '').lower()
        if needle not in alt_text:
            continue
        data_src = img.get('data-src')
        if data_src:
            alt_to_data_src[alt_text].append(format_url(data_src))

    return alt_to_data_src


def send_to_telegram(alt_text, data_src_list):
    """Send up to ten images to the configured chat as one media group.

    Only the first ``MAX_MEDIA_GROUP_SIZE`` URLs are sent; the rest are
    dropped. The first photo carries *alt_text* as its caption (truncated to
    Telegram's caption limit), the others have an empty caption.

    Args:
        alt_text: Caption text for the group.
        data_src_list: Image URLs to send. Nothing is sent when empty.
    """
    if not data_src_list:
        # An empty media group cannot be sent; nothing to do.
        return

    # NOTE(review): this calls send_media_group synchronously, which matches
    # the python-telegram-bot v13 API; in v20+ it is a coroutine -- confirm
    # the installed version.
    bot = Bot(token=TELEGRAM_BOT_TOKEN)
    caption = alt_text[:MAX_CAPTION_LENGTH]
    media_group = [
        InputMediaPhoto(media=data_src, caption=caption if index == 0 else "")
        for index, data_src in enumerate(data_src_list[:MAX_MEDIA_GROUP_SIZE])
    ]
    bot.send_media_group(chat_id=TELEGRAM_CHAT_ID, media=media_group)
    print(f"Sent {len(media_group)} image(s) with alt text: '{alt_text}'")


def main(url, word):
    """Load *url* in headless Chrome, find matching images, send them.

    Args:
        url: Page to open.
        word: Word to search for in image alt attributes.
    """
    # Set up Chrome options for headless mode.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        scroll_page(driver)
        alt_to_data_src = search_word_in_alt_tags(driver, word)

        if alt_to_data_src:
            print(f"Found {len(alt_to_data_src)} unique alt text(s) containing the word '{word}'.")
            for alt_text, data_src_list in alt_to_data_src.items():
                send_to_telegram(alt_text, data_src_list)
        else:
            print(f"The word '{word}' was not found in any alt attributes after scrolling {DEFAULT_SCROLL_COUNT} pages.")
    finally:
        # Always release the browser, even if scraping or sending failed.
        driver.quit()


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <url> <word>")
        sys.exit(1)

    url = sys.argv[1]
    word_to_search = sys.argv[2]

    # Accept bare host names on the command line.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    main(url, word_to_search)