|
@@ -0,0 +1,112 @@
|
|
|
|
|
import warnings

# Kept ahead of the third-party imports so the filter is already installed
# when `telegram` is imported.
warnings.filterwarnings("ignore", message="python-telegram-bot is using upstream urllib3")

import os
import sys
import time
from collections import defaultdict

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from telegram import Bot, InputMediaPhoto
|
|
|
|
|
+
|
|
|
|
|
+# Telegram Bot Token and Chat ID
|
|
|
|
|
# Credentials are read from the environment so they never have to be written
# into the source file. When a variable is unset the value falls back to the
# empty string, which is what these constants were hard-coded to before.
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "")
|
|
|
|
|
+
|
|
|
|
|
+# Function to ensure URLs start with "https://"
|
|
|
|
|
def format_url(url):
    """Return *url* with an explicit scheme.

    Surrounding whitespace is stripped first, since attribute values scraped
    from HTML often carry it. Then:

    * a protocol-relative URL (``//host/path``) gets ``https:`` prepended;
    * a URL that already starts with ``http://`` or ``https://`` (in any
      letter case) is returned as-is;
    * anything else is prefixed with ``https://``.

    Args:
        url: The raw URL string.

    Returns:
        The normalized URL string.
    """
    url = url.strip()
    if url.startswith("//"):
        return f"https:{url}"
    # Compare case-insensitively so "HTTPS://..." is not given a second scheme.
    if not url.lower().startswith(("http://", "https://")):
        return f"https://{url}"
    return url
|
|
|
|
|
+
|
|
|
|
|
+# Function to scroll the page a fixed number of times
|
|
|
|
|
def scroll_page(driver, scroll_pause_time=5, max_scrolls=50):
    """Scroll to the bottom of the page a fixed number of times.

    After each scroll the function sleeps for ``scroll_pause_time`` seconds
    before scrolling again.

    Args:
        driver: Object exposing ``execute_script(script)`` (the Selenium
            WebDriver in this script).
        scroll_pause_time: Seconds to sleep after every scroll.
        max_scrolls: How many times to scroll; defaults to 50, the count that
            was previously hard-coded.
    """
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
|
|
|
|
|
+
|
|
|
|
|
+# Function to save fully rendered HTML to a file for debugging
|
|
|
|
|
def save_rendered_html(driver, filename="debug.html"):
    """Write the page's current DOM to *filename* for debugging.

    The markup is obtained by asking the browser for
    ``document.documentElement.outerHTML`` and is written as UTF-8,
    overwriting any existing file. A confirmation line is printed afterwards.

    Args:
        driver: Object exposing ``execute_script(script)``.
        filename: Destination path for the HTML dump.
    """
    markup = driver.execute_script("return document.documentElement.outerHTML;")
    with open(filename, mode="w", encoding="utf-8") as dump:
        dump.write(markup)
    print(f"Fully rendered HTML saved to {filename}")
|
|
|
|
|
+
|
|
|
|
|
+# Function to search for the word in the `alt` attribute and group `data-src` by `alt` text
|
|
|
|
|
def search_word_in_alt_tags(driver, word):
    """Collect lazy-load image URLs whose ``alt`` text contains *word*.

    The current DOM is read from the browser, dumped to disk through
    ``save_rendered_html`` for debugging, and parsed with BeautifulSoup.
    Every ``<img>`` that has an ``alt`` attribute containing *word*
    (case-insensitively) and a non-blank ``data-src`` attribute contributes
    its ``data-src``, normalized by ``format_url``.

    Args:
        driver: Object exposing ``execute_script(script)``.
        word: Substring to look for inside ``alt`` attributes.

    Returns:
        A ``defaultdict(list)`` mapping each lower-cased alt text to the list
        of matching image URLs, in document order and without duplicates.
    """
    rendered_html = driver.execute_script("return document.documentElement.outerHTML;")
    save_rendered_html(driver)

    soup = BeautifulSoup(rendered_html, 'html.parser')
    alt_to_data_src = defaultdict(list)
    # Lower-case the search term once instead of once per image.
    needle = word.lower()

    for img in soup.find_all('img', alt=True):
        alt_text = img.get('alt', '').lower()
        if needle not in alt_text:
            continue

        # A whitespace-only attribute would otherwise become a bogus URL.
        data_src = (img.get('data-src') or '').strip()
        if not data_src:
            continue

        data_src = format_url(data_src)
        urls = alt_to_data_src[alt_text]
        # The same image is often repeated in the markup; keep one copy so
        # repeats do not use up the limited number of photos sent per group.
        if data_src not in urls:
            urls.append(data_src)

    return alt_to_data_src
|
|
|
|
|
+
|
|
|
|
|
+# Function to send media group to Telegram (limit to 10 photos)
|
|
|
|
|
def send_to_telegram(alt_text, data_src_list):
    """Send up to ten images to the configured Telegram chat.

    Only the first 10 URLs of *data_src_list* are used. The first photo
    carries *alt_text* as its caption (cut to 1024 characters); the rest are
    sent with an empty caption. A summary line is printed after a send.

    Args:
        alt_text: Caption text for the first photo.
        data_src_list: Image URLs to send.

    Returns:
        None. Nothing is sent or printed when *data_src_list* is empty.
    """
    photos = data_src_list[:10]
    if not photos:
        # Nothing to send; avoid an API call that can only fail.
        return

    # The Bot API limits photo captions to 1024 characters.
    caption = alt_text[:1024]
    bot = Bot(token=TELEGRAM_BOT_TOKEN)

    if len(photos) == 1:
        # The Bot API documents sendMediaGroup as taking 2-10 items, so a
        # lone photo goes through sendPhoto instead.
        bot.send_photo(chat_id=TELEGRAM_CHAT_ID, photo=photos[0], caption=caption)
    else:
        media_group = [
            InputMediaPhoto(media=data_src, caption=caption if index == 0 else "")
            for index, data_src in enumerate(photos)
        ]
        bot.send_media_group(chat_id=TELEGRAM_CHAT_ID, media=media_group)

    print(f"Sent {len(photos)} image(s) with alt text: '{alt_text}'")
|
|
|
|
|
+
|
|
|
|
|
+# Main function
|
|
|
|
|
def main(url, word):
    """Load *url* in headless Chrome, find matching images, and send them.

    The page is opened, scrolled with ``scroll_page``, and searched with
    ``search_word_in_alt_tags``. Each group of image URLs found is passed to
    ``send_to_telegram``; if nothing matches, a message saying so is printed.
    The browser is shut down in all cases once it has been started.

    Args:
        url: Address of the page to open.
        word: Word to look for in image ``alt`` attributes.
    """
    # Command-line switches for the headless Chrome session.
    chrome_flags = (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "--disable-blink-features=AutomationControlled",
    )
    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        scroll_page(driver)

        matches = search_word_in_alt_tags(driver, word)
        if not matches:
            print(f"The word '{word}' was not found in any alt attributes after scrolling 50 pages.")
        else:
            print(f"Found {len(matches)} unique alt text(s) containing the word '{word}'.")
            for alt_text, data_src_list in matches.items():
                send_to_telegram(alt_text, data_src_list)
    finally:
        # Always release the browser, even if loading or sending failed.
        driver.quit()
|
|
|
|
|
+
|
|
|
|
|
if __name__ == "__main__":
    # Exactly two arguments are required: the page URL and the search word.
    if len(sys.argv) != 3:
        print("Usage: python script.py <url> <word_to_search>")
        sys.exit(1)

    url, word_to_search = sys.argv[1:]

    # Assume HTTPS when the URL was given without a scheme.
    if not (url.startswith("http://") or url.startswith("https://")):
        url = f"https://{url}"

    main(url, word_to_search)
|