From 1299d5a09747157330b7ca759966ee345c082e48 Mon Sep 17 00:00:00 2001
From: DerGamerPanda
Date: Thu, 26 Jun 2025 00:48:28 +0200
Subject: [PATCH] Add files via upload

---
 Dockerfile         |  10 ++++
 crawler_cli.py     |  50 +++++++++++++++++++
 crawler_gui.py     |  75 ++++++++++++++++++++++++++++
 docker-compose.yml |  23 +++++++++
 readme.txt         |  87 ++++++++++++++++++++++++++++++++
 requirements.txt   |   6 +++
 webcrawler.py      | 121 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 372 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 crawler_cli.py
 create mode 100644 crawler_gui.py
 create mode 100644 docker-compose.yml
 create mode 100644 readme.txt
 create mode 100644 requirements.txt
 create mode 100644 webcrawler.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..43604a9
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+CMD ["streamlit", "run", "crawler_gui.py", "--server.port=8501", "--server.enableCORS=false"]
\ No newline at end of file
diff --git a/crawler_cli.py b/crawler_cli.py
new file mode 100644
index 0000000..83102a7
--- /dev/null
+++ b/crawler_cli.py
@@ -0,0 +1,50 @@
+from webcrawler import Crawler, save_csv
+import argparse
+import time
+from datetime import datetime
+
+def parse_comma_list(value):
+    return [v.strip() for v in value.split(",") if v.strip()]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Headless Web Crawler with Fulltext + Portscan")
+    parser.add_argument("urls", help="Comma-separated list of start URLs")
+    parser.add_argument("keywords", help="Comma-separated list of keywords")
+    parser.add_argument("-n", "--pages", type=int, default=20, help="Max pages per site")
+    parser.add_argument("-t", "--threads", type=int, default=5, help="Number of threads")
+    parser.add_argument("--portscan", action="store_true", help="Enable custom port scan")
+    parser.add_argument("--ports", type=str, default="", help="Ports to scan (comma-separated)")
+    parser.add_argument("-o", "--output", help="Output CSV filename (optional)")
+
+    args = parser.parse_args()
+
+    urls = parse_comma_list(args.urls)
+    keywords = parse_comma_list(args.keywords)
+    ports = [int(p) for p in parse_comma_list(args.ports)] if args.portscan else []
+
+    all_results = []
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output = args.output or f"crawl_{timestamp}.csv"
+
+    for url in urls:
+        for keyword in keywords:
+            print(f"\n🌐 Crawling '{url}' for keyword: '{keyword}'")
+            crawler = Crawler(
+                start_url=url,
+                keyword=keyword,
+                max_pages=args.pages,
+                num_threads=args.threads,
+                enable_portscan=args.portscan,
+                ports_to_scan=ports
+            )
+            results = crawler.run()
+            for r in results:
+                r["keyword"] = keyword
+                r["start_url"] = url
+            all_results.extend(results)
+
+    if all_results:
+        save_csv(all_results, output)
+        print(f"\n✅ Done: {len(all_results)} results saved to '{output}'")
+    else:
+        print("❌ No matches found.")
\ No newline at end of file
diff --git a/crawler_gui.py b/crawler_gui.py
new file mode 100644
index 0000000..ca28f01
--- /dev/null
+++ b/crawler_gui.py
@@ -0,0 +1,75 @@
+import streamlit as st
+import pandas as pd
+import time
+from webcrawler import Crawler, save_csv
+from datetime import datetime
+
+st.set_page_config(page_title="Multi-Site Web Crawler", layout="wide")
+st.title("🕷️ Web Crawler with Fulltext + Port Scanning")
+
+# User input fields
+start_urls_input = st.text_area("Start URLs (comma-separated)", "https://example.com, https://example.org")
(comma-separated)", "https://example.com, https://example.org") +keywords_input = st.text_input("Keywords (comma-separated)", "AI, privacy") + +max_pages = st.slider("Maximum pages per site", 5, 1000, 200) +threads = st.slider("Number of threads per crawl", 1, 20, 5) + +enable_scan = st.checkbox("πŸ”Œ Enable custom port scan") +port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443,8080") if enable_scan else "" + +# Session history +if "search_history" not in st.session_state: + st.session_state.search_history = [] + +if st.button("Start Crawling"): + urls = [u.strip() for u in start_urls_input.split(",") if u.strip()] + keywords = [k.strip() for k in keywords_input.split(",") if k.strip()] + ports = [int(p.strip()) for p in port_input.split(",") if p.strip().isdigit()] if enable_scan else [] + + all_results = [] + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"crawl_{timestamp}.csv" + + with st.spinner("Crawling… please wait"): + for url in urls: + for keyword in keywords: + st.write(f"πŸ” Crawling **{url}** for keyword '**{keyword}**'...") + crawler = Crawler( + url, + keyword, + max_pages=max_pages, + num_threads=threads, + enable_portscan=enable_scan, + ports_to_scan=ports + ) + results = crawler.run() + for r in results: + r["keyword"] = keyword + r["start_url"] = url + all_results.extend(results) + + save_csv(all_results, filename) + st.session_state.search_history.append({ + "timestamp": str(datetime.now()), + "start_urls": urls, + "keywords": keywords, + "file": filename + }) + + if all_results: + df = pd.DataFrame(all_results) + st.success(f"βœ… {len(all_results)} results found.") + st.dataframe(df) + with open(filename, "rb") as f: + st.download_button("πŸ“₯ Download CSV", f, file_name=filename, mime="text/csv") + else: + st.warning("No matches found.") + +# Sidebar search history +st.sidebar.header("πŸ“ Search History") +for entry in reversed(st.session_state.search_history[-5:]): + st.sidebar.write(f"πŸ•“ {entry['timestamp']}") + st.sidebar.write("Start URLs:", ", ".join(entry['start_urls'])) + st.sidebar.write("Keywords:", ", ".join(entry['keywords'])) + st.sidebar.write(f"πŸ“Ž File: `{entry['file']}`") + st.sidebar.markdown("---") \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..5f13623 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,23 @@ +version: '3.9' + +services: + webcrawler_gui: + build: . + container_name: webcrawler_gui + ports: + - "8501:8501" + volumes: + - webcrawler_data:/app/output + restart: unless-stopped + command: streamlit run crawler_gui.py --server.port=8501 --server.enableCORS=false + + webcrawler_cli: + build: . 
+    container_name: webcrawler_cli
+    entrypoint: ["python", "crawler_cli.py", "https://example.com", "AI,robots", "-n", "30", "-t", "6", "-o", "output/scan.csv"]
+    volumes:
+      - webcrawler_data:/app/output
+    restart: "no"
+
+volumes:
+  webcrawler_data:
\ No newline at end of file
diff --git a/readme.txt b/readme.txt
new file mode 100644
index 0000000..348a394
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,87 @@
+webcrawler_app/
+├── webcrawler.py      ← the crawler logic (translated to English)
+├── crawler_gui.py     ← the Streamlit user interface
+└── requirements.txt   ← with all your dependencies
+
+python -m venv venv
+.\venv\Scripts\activate   # On Windows
+
+pip install -r requirements.txt
+
+python -m pip install streamlit
+
+python -m streamlit run crawler_gui.py
+
+pyinstaller --noconfirm --onefile --windowed crawler_gui.py
+
+
+🗂️ Project Structure
+webcrawler_project/
+├── crawler_cli.py
+├── crawler_gui.py
+├── webcrawler.py
+├── requirements.txt
+├── Dockerfile
+└── docker-compose.yml
+
+- The webcrawler_gui service gives you a browser interface at http://localhost:8501
+- The webcrawler_cli service runs the crawler once and exits, which makes it well suited for automation
+
+📦 How to Use It
+- Build & run all services:
+  docker-compose up --build
+- Want to run only the GUI?
+  docker-compose up webcrawler_gui
+- Want to run only the CLI job?
+  docker-compose run --rm webcrawler_cli
+- Stop services:
+  docker-compose down
+
+
+📁 Saving Output Files
+By default, output CSVs are saved inside the container. If you'd like to access them from your host system, update the webcrawler_cli service in docker-compose.yml like this:
+    volumes:
+      - ./output:/app/output
+
+And update crawler_cli.py so it saves files to output/filename.csv (a sketch of this change is included at the end of this readme).
+
+
+🔧 Step 1: Prepare Your Project
+Make sure your project folder contains:
+webcrawler_project/
+├── crawler_cli.py
+├── crawler_gui.py
+├── webcrawler.py
+├── requirements.txt
+├── Dockerfile
+└── docker-compose.yml
+
+If you followed the setup above, you're all set.
+
+🚀 Step 2: Zip and Upload to Portainer
+- Compress your project folder into a .zip file on your local machine.
+- Log into Portainer.
+- Go to Stacks → Add Stack.
+- Give your stack a name (e.g. webcrawler).
+- In the Web editor, paste the contents of your docker-compose.yml.
+- Scroll down to "Advanced container settings" and upload the zipped project using the "Upload resources" option.
+- Click Deploy the stack.
+
+🖥️ Step 3: Access Your Services
+- For the GUI, Portainer will expose port 8501 by default; access it via http://your-server-ip:8501.
+- The CLI crawler will run on startup and exit. You can re-run it anytime from the Containers view.
+
+✅ Optional Enhancements
+- Mount a volume for persistent CSV output:
+    volumes:
+      - webcrawler_data:/app/output
+- Add environment variables or schedules via Portainer's built-in options (see the example at the end of this readme).
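+
+Example: one way to make crawler_cli.py write into output/ by default. This is
+a minimal sketch, not the shipped code; it assumes you keep the existing
+argument parsing and replace the current "output = ..." line in the
+__main__ block:
+
+    import os
+
+    os.makedirs("output", exist_ok=True)   # make sure the mounted folder exists
+    output = args.output or os.path.join("output", f"crawl_{timestamp}.csv")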
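+
+Example: passing environment variables to the CLI service via docker-compose.yml
+or Portainer's stack editor. This is a sketch only; the variable names below are
+placeholders, and the current crawler scripts do not read any environment
+variables yet:
+
+  webcrawler_cli:
+    environment:
+      - CRAWL_MAX_PAGES=30
+      - CRAWL_THREADS=6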
+
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..befb3ff
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+requests
+beautifulsoup4
+urllib3
+pandas
+streamlit
+dnspython
\ No newline at end of file
diff --git a/webcrawler.py b/webcrawler.py
new file mode 100644
index 0000000..3592cd8
--- /dev/null
+++ b/webcrawler.py
@@ -0,0 +1,121 @@
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+import socket
+import argparse
+import threading
+import queue
+import time
+import csv
+import urllib.robotparser
+
+def scan_ports(ip, ports, timeout=1.0):
+    open_ports = []
+    for port in ports:
+        try:
+            with socket.create_connection((ip, port), timeout=timeout):
+                open_ports.append(port)
+        except OSError:
+            continue
+    return open_ports
+
+class Crawler:
+    def __init__(self, start_url, keyword, max_pages=20, num_threads=5, enable_portscan=False, ports_to_scan=None):
+        self.start_url = start_url
+        self.keyword = keyword.lower()
+        self.max_pages = max_pages
+        self.num_threads = num_threads
+        self.enable_portscan = enable_portscan
+        self.ports_to_scan = ports_to_scan if ports_to_scan else []
+        self.visited = set()
+        self.to_visit = queue.Queue()
+        self.to_visit.put(start_url)
+        self.lock = threading.Lock()
+        self.results = []
+        self.rp_cache = {}
+
+    def get_ip(self, url):
+        try:
+            return socket.gethostbyname(urlparse(url).hostname)
+        except Exception:
+            return "Unresolvable"
+
+    def obey_robots(self, url):
+        domain = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
+        if domain in self.rp_cache:
+            rp = self.rp_cache[domain]
+        else:
+            rp = urllib.robotparser.RobotFileParser()
+            rp.set_url(urljoin(domain, "/robots.txt"))
+            try:
+                rp.read()
+            except Exception:
+                rp = None
+            self.rp_cache[domain] = rp
+        return rp and rp.can_fetch("*", url)
+
+    def crawl_page(self):
+        while len(self.visited) < self.max_pages:
+            try:
+                url = self.to_visit.get(timeout=3)  # don't block forever if another worker drained the queue
+            except queue.Empty:
+                break
+            with self.lock:
+                if url in self.visited:
+                    continue
+                self.visited.add(url)
+
+            if not self.obey_robots(url):
+                continue
+
+            try:
+                response = requests.get(url, timeout=6)
+                soup = BeautifulSoup(response.text, "html.parser")
+                text = soup.get_text().lower()
+                fulltext = ' '.join(text.split())
+            except Exception:
+                continue
+
+            if self.keyword in text:
+                title = soup.title.string.strip() if soup.title and soup.title.string else "No Title"
+                ip = self.get_ip(url)
+                if self.enable_portscan and ip != "Unresolvable":
+                    open_ports = scan_ports(ip, self.ports_to_scan)
+                    ports_str = ", ".join(str(p) for p in open_ports) if open_ports else "None"
+                else:
+                    ports_str = "Not Scanned"
+
+                with self.lock:
+                    self.results.append({
+                        "title": title,
+                        "url": url,
+                        "ip": ip,
+                        "keyword": self.keyword,
+                        "fulltext": fulltext,
+                        "open_ports": ports_str
+                    })
+
+            for tag in soup.find_all("a", href=True):
+                next_url = urljoin(url, tag['href'])
+                if next_url.startswith("http") and urlparse(next_url).netloc:
+                    with self.lock:
+                        if next_url not in self.visited:
+                            self.to_visit.put(next_url)
+
+    def run(self):
+        threads = []
+        for _ in range(self.num_threads):
+            t = threading.Thread(target=self.crawl_page)
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()
+        return self.results
+
+def save_csv(results, filename="crawler_results.csv"):
+    if not results:
+        print("⚠️ Nothing to save.")
+        return
+    fieldnames = list(results[0].keys())
+    with open(filename, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in results:
+            writer.writerow(row)
\ No newline at end of file