Add files via upload
Dockerfile (new file)
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

CMD ["streamlit", "run", "crawler_gui.py", "--server.port=8501", "--server.enableCORS=false"]
crawler_cli.py (new file)
from webcrawler import Crawler, save_csv
import argparse
from datetime import datetime


def parse_comma_list(value):
    """Split a comma-separated string into a list of trimmed, non-empty items."""
    return [v.strip() for v in value.split(",") if v.strip()]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Headless Web Crawler with Fulltext + Portscan")
    parser.add_argument("urls", help="Comma-separated list of start URLs")
    parser.add_argument("keywords", help="Comma-separated list of keywords")
    parser.add_argument("-n", "--pages", type=int, default=20, help="Max pages per site")
    parser.add_argument("-t", "--threads", type=int, default=5, help="Number of threads")
    parser.add_argument("--portscan", action="store_true", help="Enable custom port scan")
    parser.add_argument("--ports", type=str, default="", help="Ports to scan (comma-separated)")
    parser.add_argument("-o", "--output", help="Output CSV filename (optional)")

    args = parser.parse_args()

    urls = parse_comma_list(args.urls)
    keywords = parse_comma_list(args.keywords)
    ports = [int(p) for p in parse_comma_list(args.ports)] if args.portscan else []

    all_results = []
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output = args.output or f"crawl_{timestamp}.csv"

    # Crawl every start URL once per keyword and collect all matches.
    for url in urls:
        for keyword in keywords:
            print(f"\n🌐 Crawling '{url}' for keyword: '{keyword}'")
            crawler = Crawler(
                start_url=url,
                keyword=keyword,
                max_pages=args.pages,
                num_threads=args.threads,
                enable_portscan=args.portscan,
                ports_to_scan=ports
            )
            results = crawler.run()
            for r in results:
                r["keyword"] = keyword
                r["start_url"] = url
            all_results.extend(results)

    if all_results:
        save_csv(all_results, output)
        print(f"\n✅ Done: {len(all_results)} results saved to '{output}'")
    else:
        print("❌ No matches found.")
crawler_gui.py (new file)
import streamlit as st
import pandas as pd
from webcrawler import Crawler, save_csv
from datetime import datetime

st.set_page_config(page_title="Multi-Site Web Crawler", layout="wide")
st.title("🕷️ Web Crawler with Fulltext + Port Scanning")

# User input fields
start_urls_input = st.text_area("Start URLs (comma-separated)", "https://example.com, https://example.org")
keywords_input = st.text_input("Keywords (comma-separated)", "AI, privacy")

max_pages = st.slider("Maximum pages per site", 5, 1000, 200)
threads = st.slider("Number of threads per crawl", 1, 20, 5)

enable_scan = st.checkbox("🔌 Enable custom port scan")
port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443,8080") if enable_scan else ""

# Session history
if "search_history" not in st.session_state:
    st.session_state.search_history = []

if st.button("Start Crawling"):
    urls = [u.strip() for u in start_urls_input.split(",") if u.strip()]
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]
    ports = [int(p.strip()) for p in port_input.split(",") if p.strip().isdigit()] if enable_scan else []

    all_results = []
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"crawl_{timestamp}.csv"

    with st.spinner("Crawling… please wait"):
        # Run one crawl per (start URL, keyword) combination.
        for url in urls:
            for keyword in keywords:
                st.write(f"🔍 Crawling **{url}** for keyword '**{keyword}**'...")
                crawler = Crawler(
                    url,
                    keyword,
                    max_pages=max_pages,
                    num_threads=threads,
                    enable_portscan=enable_scan,
                    ports_to_scan=ports
                )
                results = crawler.run()
                for r in results:
                    r["keyword"] = keyword
                    r["start_url"] = url
                all_results.extend(results)

    save_csv(all_results, filename)
    st.session_state.search_history.append({
        "timestamp": str(datetime.now()),
        "start_urls": urls,
        "keywords": keywords,
        "file": filename
    })

    if all_results:
        df = pd.DataFrame(all_results)
        st.success(f"✅ {len(all_results)} results found.")
        st.dataframe(df)
        with open(filename, "rb") as f:
            st.download_button("📥 Download CSV", f, file_name=filename, mime="text/csv")
    else:
        st.warning("No matches found.")

# Sidebar search history
st.sidebar.header("📁 Search History")
for entry in reversed(st.session_state.search_history[-5:]):
    st.sidebar.write(f"🕓 {entry['timestamp']}")
    st.sidebar.write("Start URLs:", ", ".join(entry['start_urls']))
    st.sidebar.write("Keywords:", ", ".join(entry['keywords']))
    st.sidebar.write(f"📎 File: `{entry['file']}`")
    st.sidebar.markdown("---")
docker-compose.yml (new file)
version: '3.9'

services:
  webcrawler_gui:
    build: .
    container_name: webcrawler_gui
    ports:
      - "8501:8501"
    volumes:
      - webcrawler_data:/app/output
    restart: unless-stopped
    command: streamlit run crawler_gui.py --server.port=8501 --server.enableCORS=false

  webcrawler_cli:
    build: .
    container_name: webcrawler_cli
    entrypoint: ["python", "crawler_cli.py", "https://example.com", "AI,robots", "-n", "30", "-t", "6", "-o", "output/scan.csv"]
    volumes:
      - webcrawler_data:/app/output
    restart: "no"

volumes:
  webcrawler_data:
readme.txt (new file)
🛠️ Local Setup (without Docker)

webcrawler_app/
├── webcrawler.py       ← the crawler logic (translated to English)
├── crawler_gui.py      ← the Streamlit user interface
├── requirements.txt    ← with all your dependencies

Create and activate a virtual environment:

python -m venv venv
.\venv\Scripts\activate         # On Windows
source venv/bin/activate        # On Linux/macOS

Install the dependencies:

pip install -r requirements.txt
python -m pip install streamlit     # optional; streamlit is already in requirements.txt

Run the GUI:

python -m streamlit run crawler_gui.py

Optionally, build a standalone executable (requires PyInstaller, installed separately with pip install pyinstaller):

pyinstaller --noconfirm --onefile --windowed crawler_gui.py
🗂️ Project Structure

webcrawler_project/
├── crawler_cli.py
├── crawler_gui.py
├── webcrawler.py
├── requirements.txt
├── Dockerfile
└── docker-compose.yml
- The webcrawler_gui service gives you a browser interface at http://localhost:8501
- The webcrawler_cli service runs the crawler once and exits, which makes it ideal for automation (a programmatic alternative is sketched after the command list below)

📦 How to Use It

- Build & run all services:
  docker-compose up --build
- Run only the GUI:
  docker-compose up webcrawler_gui
- Run only the CLI job:
  docker-compose run --rm webcrawler_cli
- Stop services:
  docker-compose down
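Because the crawler is an ordinary Python module, automation can also bypass the CLI entrypoint and import it directly. A minimal sketch using the Crawler and save_csv API from webcrawler.py (the URL, keyword, and filename below are placeholders):

from webcrawler import Crawler, save_csv

# One programmatic crawl, equivalent to a single CLI run.
crawler = Crawler(
    start_url="https://example.com",
    keyword="AI",
    max_pages=30,
    num_threads=6,
    enable_portscan=False,
)
results = crawler.run()
save_csv(results, "scan_results.csv")   # prints a warning and writes nothing if there were no matches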
📁 Saving Output Files

By default, output CSVs are saved inside the container. If you'd like to access them from your host system, update the webcrawler_cli service in docker-compose.yml to mount a host directory instead:

volumes:
  - ./output:/app/output

Then update crawler_cli.py so it saves its files under output/, e.g. output/filename.csv (see the sketch below).
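A minimal sketch of that crawler_cli.py change (the output/ directory name is an assumption chosen to match the /app/output mount; the timestamped default filename mirrors the existing CLI logic):

import os
from datetime import datetime

# Build the default CSV path inside an output/ folder so the file lands on
# the volume mounted at /app/output.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)      # create the folder on first run

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
default_output = os.path.join(output_dir, f"crawl_{timestamp}.csv")
print(default_output)                       # e.g. output/crawl_20240101_120000.csv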
🔧 Step 1: Prepare Your Project

Make sure your project folder contains:

crawler-app/
├── crawler_cli.py
├── crawler_gui.py
├── webcrawler.py
├── requirements.txt
├── Dockerfile
└── docker-compose.yml

If you followed the setup above, you're all set.
🚀 Step 2: Zip and Upload to Portainer

- Compress your project folder into a .zip file on your local machine.
- Log into Portainer.
- Go to Stacks → Add Stack.
- Give your stack a name (e.g. webcrawler).
- In the web editor, paste the contents of your docker-compose.yml.
- Scroll down to “Advanced container settings” and upload the zipped project using the “Upload resources” option.
- Click “Deploy the stack”.

🖥️ Step 3: Access Your Services

- The GUI is published on port 8501 as configured in docker-compose.yml; access it via http://your-server-ip:8501.
- The CLI crawler runs once on startup and exits. You can re-run it anytime from the Containers view.
✅ Optional Enhancements

- Mount a named volume for persistent CSV output (docker-compose.yml already does this):

  volumes:
    - webcrawler_data:/app/output

- Add environment variables or schedules via Portainer’s built-in options (one possible env-variable pattern is sketched below).
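If you go the environment-variable route, one possible pattern (the variable names CRAWL_URLS, CRAWL_KEYWORDS, and CRAWL_MAX_PAGES are hypothetical, not something the current crawler_cli.py reads) is to let the script fall back to env values:

import os

# Hypothetical pattern for Portainer-provided settings; adapt crawler_cli.py
# to read these before falling back to its argparse defaults.
urls = os.environ.get("CRAWL_URLS", "https://example.com").split(",")
keywords = os.environ.get("CRAWL_KEYWORDS", "AI").split(",")
max_pages = int(os.environ.get("CRAWL_MAX_PAGES", "30"))

print(urls, keywords, max_pages)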
requirements.txt (new file)
requests
beautifulsoup4
urllib3
pandas
streamlit
dnspython
webcrawler.py (new file)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import socket
import threading
import queue
import csv
import urllib.robotparser


def scan_ports(ip, ports, timeout=1.0):
    """Return the subset of `ports` that accept a TCP connection on `ip`."""
    open_ports = []
    for port in ports:
        try:
            with socket.create_connection((ip, port), timeout=timeout):
                open_ports.append(port)
        except OSError:
            continue
    return open_ports


class Crawler:
    def __init__(self, start_url, keyword, max_pages=20, num_threads=5, enable_portscan=False, ports_to_scan=None):
        self.start_url = start_url
        self.keyword = keyword.lower()
        self.max_pages = max_pages
        self.num_threads = num_threads
        self.enable_portscan = enable_portscan
        self.ports_to_scan = ports_to_scan if ports_to_scan else []
        self.visited = set()
        self.to_visit = queue.Queue()
        self.to_visit.put(start_url)
        self.lock = threading.Lock()
        self.results = []
        self.rp_cache = {}  # robots.txt parser cached per domain

    def get_ip(self, url):
        try:
            return socket.gethostbyname(urlparse(url).hostname)
        except (OSError, TypeError):
            return "Unresolvable"

    def obey_robots(self, url):
        """Check robots.txt (cached per domain); an unreadable robots.txt skips the URL."""
        domain = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
        if domain in self.rp_cache:
            rp = self.rp_cache[domain]
        else:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urljoin(domain, "/robots.txt"))
            try:
                rp.read()
            except Exception:
                rp = None
            self.rp_cache[domain] = rp
        return rp is not None and rp.can_fetch("*", url)

    def crawl_page(self):
        while len(self.visited) < self.max_pages and not self.to_visit.empty():
            try:
                # Non-blocking get: another worker may have drained the queue
                # between the empty() check and this call.
                url = self.to_visit.get_nowait()
            except queue.Empty:
                continue

            with self.lock:
                if url in self.visited:
                    continue
                self.visited.add(url)

            if not self.obey_robots(url):
                continue

            try:
                response = requests.get(url, timeout=6)
                soup = BeautifulSoup(response.text, "html.parser")
                text = soup.get_text().lower()
                fulltext = ' '.join(text.split())
            except Exception:
                continue

            if self.keyword in text:
                title = soup.title.string.strip() if soup.title and soup.title.string else "No Title"
                ip = self.get_ip(url)
                if self.enable_portscan and ip != "Unresolvable":
                    open_ports = scan_ports(ip, self.ports_to_scan)
                    ports_str = ", ".join(str(p) for p in open_ports) if open_ports else "None"
                else:
                    ports_str = "Not Scanned"

                with self.lock:
                    self.results.append({
                        "title": title,
                        "url": url,
                        "ip": ip,
                        "keyword": self.keyword,
                        "fulltext": fulltext,
                        "open_ports": ports_str
                    })

            # Queue outgoing links regardless of whether the keyword matched.
            for tag in soup.find_all("a", href=True):
                next_url = urljoin(url, tag['href'])
                if next_url.startswith("http") and urlparse(next_url).netloc:
                    with self.lock:
                        if next_url not in self.visited:
                            self.to_visit.put(next_url)

    def run(self):
        threads = []
        for _ in range(self.num_threads):
            t = threading.Thread(target=self.crawl_page)
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        return self.results


def save_csv(results, filename="crawler_results.csv"):
    if not results:
        print("⚠️ Nothing to save.")
        return
    fieldnames = list(results[0].keys())
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            writer.writerow(row)