| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- #!/usr/bin/python3
import warnings

# Installed before the telegram import below so the filter is already active
# when that package loads.
warnings.filterwarnings("ignore", message="python-telegram-bot is using upstream urllib3")

import os
import sys
import time
from collections import defaultdict

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from telegram import Bot, InputMediaPhoto
- # Telegram Bot Token and Chat ID
# Credentials are taken from the environment so they do not have to be stored
# in the source file. When a variable is unset the value falls back to the
# empty string, which is what the script used before.
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "")
- # Function to ensure URLs start with "https://"
def format_url(url):
    """Return *url* with an explicit ``http://`` or ``https://`` scheme.

    Args:
        url: A URL string, typically taken from an HTML attribute. It may be
            protocol-relative (``//host/path``), scheme-less (``host/path``)
            or already absolute.

    Returns:
        The URL with surrounding whitespace removed. Protocol-relative and
        scheme-less inputs are prefixed with ``https:``/``https://``; inputs
        that already carry an HTTP(S) scheme are returned otherwise unchanged.
    """
    # Attribute values frequently carry stray spaces or newlines; left in
    # place they would defeat the prefix checks below.
    url = url.strip()
    if url.startswith("//"):
        return f"https:{url}"
    # URL schemes are case-insensitive, so "HTTPS://..." must not be prefixed
    # a second time.
    if not url.lower().startswith(("http://", "https://")):
        return f"https://{url}"
    return url
- # Function to scroll the page a fixed number of times
def scroll_page(driver, scroll_pause_time=5, max_scrolls=50):
    """Scroll the page to the bottom a fixed number of times.

    Args:
        driver: Object exposing ``execute_script(script)``; a Selenium
            WebDriver in this script.
        scroll_pause_time: Seconds to sleep after each scroll.
        max_scrolls: Number of scroll/sleep rounds to perform. Defaults to
            50, the count that was previously hard-coded.
    """
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Pause after every scroll before issuing the next one.
        time.sleep(scroll_pause_time)
- # Function to save fully rendered HTML to a file for debugging
def save_rendered_html(driver, filename="debug.html"):
    """Write the page's current rendered HTML to *filename* for debugging.

    Args:
        driver: Object exposing ``execute_script(script)``; a Selenium
            WebDriver in this script.
        filename: Path of the file to (over)write, encoded as UTF-8.

    Returns:
        The HTML string that was written, so a caller can parse exactly the
        snapshot that was saved instead of querying the browser again.
    """
    rendered_html = driver.execute_script("return document.documentElement.outerHTML;")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(rendered_html)
    print(f"Fully rendered HTML saved to {filename}")
    return rendered_html
- # Function to search for the word in the `alt` attribute and group `data-src` by `alt` text
def search_word_in_alt_tags(driver, word):
    """Group image ``data-src`` URLs by alt text for images matching *word*.

    The rendered page is read from *driver*, a debug copy is written via
    ``save_rendered_html``, and every ``<img>`` that has an ``alt`` attribute
    is inspected. Matching is a case-insensitive substring test.

    Args:
        driver: Object exposing ``execute_script(script)``; a Selenium
            WebDriver in this script.
        word: Text to look for inside each image's ``alt`` attribute.

    Returns:
        A ``defaultdict(list)`` mapping each lower-cased alt text to the list
        of normalised ``data-src`` URLs found for it, in document order and
        without duplicates. Images lacking a ``data-src`` value are skipped.
    """
    rendered_html = driver.execute_script("return document.documentElement.outerHTML;")
    save_rendered_html(driver)
    soup = BeautifulSoup(rendered_html, 'html.parser')

    # Lower-case the search term once rather than on every image.
    needle = word.lower()
    alt_to_data_src = defaultdict(list)
    for img in soup.find_all('img', alt=True):
        alt_text = img.get('alt', '').lower()
        if needle not in alt_text:
            continue
        data_src = img.get('data-src')
        if not data_src:
            continue
        data_src = format_url(data_src)
        # The same image can appear several times in the markup; keep only
        # the first occurrence so the same photo is not sent repeatedly.
        urls = alt_to_data_src[alt_text]
        if data_src not in urls:
            urls.append(data_src)
    return alt_to_data_src
- # Function to send media group to Telegram (limit to 10 photos)
def send_to_telegram(alt_text, data_src_list):
    """Send up to ten image URLs to the configured Telegram chat.

    Args:
        alt_text: Caption text; attached to the first (or only) photo.
        data_src_list: Image URLs. Only the first ten are sent.

    Nothing is sent when *data_src_list* is empty.
    """
    photos = data_src_list[:10]
    if not photos:
        return

    # Telegram limits captions to 1024 characters; clip rather than have the
    # whole request rejected.
    caption = alt_text[:1024]
    bot = Bot(token=TELEGRAM_BOT_TOKEN)

    if len(photos) == 1:
        # sendMediaGroup requires 2-10 items, so a lone image has to go out
        # as a regular photo message.
        bot.send_photo(chat_id=TELEGRAM_CHAT_ID, photo=photos[0], caption=caption)
    else:
        media_group = [
            InputMediaPhoto(media=data_src, caption=caption if index == 0 else "")
            for index, data_src in enumerate(photos)
        ]
        bot.send_media_group(chat_id=TELEGRAM_CHAT_ID, media=media_group)
    print(f"Sent {len(photos)} image(s) with alt text: '{alt_text}'")
- # Main function
def main(url, word):
    """Scrape *url* for images whose alt text contains *word* and forward them.

    A headless Chrome session loads the page, ``scroll_page`` scrolls it,
    ``search_word_in_alt_tags`` collects the matching images, and each
    alt-text group is passed to ``send_to_telegram``. The browser is always
    shut down, even when an error occurs.

    Args:
        url: Address of the page to load.
        word: Text to look for in image ``alt`` attributes.
    """
    # Set up Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    # NOTE(review): the next three settings presumably make the session look
    # less like an automated browser -- confirm they are still needed.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    # Set up the WebDriver with headless options
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        scroll_page(driver)
        alt_to_data_src = search_word_in_alt_tags(driver, word)
        if alt_to_data_src:
            print(f"Found {len(alt_to_data_src)} unique alt text(s) containing the word '{word}'.")
            for alt_text, data_src_list in alt_to_data_src.items():
                # One failed upload should not discard the remaining groups,
                # so report the failure and carry on with the next one.
                try:
                    send_to_telegram(alt_text, data_src_list)
                except Exception as exc:
                    print(
                        f"Failed to send image(s) with alt text '{alt_text}': {exc}",
                        file=sys.stderr,
                    )
        else:
            print(f"The word '{word}' was not found in any alt attributes after scrolling 50 pages.")
    finally:
        driver.quit()
if __name__ == "__main__":
    # Command-line entry point: expects exactly a URL and a search word.
    if len(sys.argv) != 3:
        print("Usage: python script.py <url> <word_to_search>")
        sys.exit(1)
    # Normalise through the shared helper instead of repeating its prefix
    # check here; this also handles protocol-relative input ("//host/path").
    url = format_url(sys.argv[1])
    word_to_search = sys.argv[2]
    main(url, word_to_search)
|