Add files via upload

DerGamerPanda committed 2025-06-26 00:48:28 +02:00 (committed by GitHub)
parent 06ba6eff6f
commit 1299d5a097
7 changed files with 372 additions and 0 deletions

Dockerfile Normal file

@@ -0,0 +1,10 @@
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["streamlit", "run", "crawler_gui.py", "--server.port=8501", "--server.enableCORS=false"]

crawler_cli.py Normal file

@@ -0,0 +1,50 @@
from webcrawler import Crawler, save_csv
import argparse
from datetime import datetime


def parse_comma_list(value):
    return [v.strip() for v in value.split(",") if v.strip()]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Headless Web Crawler with Fulltext + Portscan")
    parser.add_argument("urls", help="Comma-separated list of start URLs")
    parser.add_argument("keywords", help="Comma-separated list of keywords")
    parser.add_argument("-n", "--pages", type=int, default=20, help="Max pages per site")
    parser.add_argument("-t", "--threads", type=int, default=5, help="Number of threads")
    parser.add_argument("--portscan", action="store_true", help="Enable custom port scan")
    parser.add_argument("--ports", type=str, default="", help="Ports to scan (comma-separated)")
    parser.add_argument("-o", "--output", help="Output CSV filename (optional)")
    args = parser.parse_args()

    urls = parse_comma_list(args.urls)
    keywords = parse_comma_list(args.keywords)
    ports = [int(p) for p in parse_comma_list(args.ports)] if args.portscan else []

    all_results = []
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output = args.output or f"crawl_{timestamp}.csv"

    # Crawl every start URL once per keyword and collect all results.
    for url in urls:
        for keyword in keywords:
            print(f"\n🌐 Crawling '{url}' for keyword: '{keyword}'")
            crawler = Crawler(
                start_url=url,
                keyword=keyword,
                max_pages=args.pages,
                num_threads=args.threads,
                enable_portscan=args.portscan,
                ports_to_scan=ports
            )
            results = crawler.run()
            for r in results:
                r["keyword"] = keyword
                r["start_url"] = url
            all_results.extend(results)

    if all_results:
        save_csv(all_results, output)
        print(f"\n✅ Done: {len(all_results)} results saved to '{output}'")
    else:
        print("❌ No matches found.")

crawler_gui.py Normal file

@@ -0,0 +1,75 @@
import streamlit as st
import pandas as pd
from webcrawler import Crawler, save_csv
from datetime import datetime

st.set_page_config(page_title="Multi-Site Web Crawler", layout="wide")
st.title("🕷️ Web Crawler with Fulltext + Port Scanning")

# User input fields
start_urls_input = st.text_area("Start URLs (comma-separated)", "https://example.com, https://example.org")
keywords_input = st.text_input("Keywords (comma-separated)", "AI, privacy")
max_pages = st.slider("Maximum pages per site", 5, 1000, 200)
threads = st.slider("Number of threads per crawl", 1, 20, 5)
enable_scan = st.checkbox("🔌 Enable custom port scan")
port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443,8080") if enable_scan else ""

# Session history
if "search_history" not in st.session_state:
    st.session_state.search_history = []

if st.button("Start Crawling"):
    urls = [u.strip() for u in start_urls_input.split(",") if u.strip()]
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]
    ports = [int(p.strip()) for p in port_input.split(",") if p.strip().isdigit()] if enable_scan else []

    all_results = []
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"crawl_{timestamp}.csv"

    with st.spinner("Crawling… please wait"):
        for url in urls:
            for keyword in keywords:
                st.write(f"🔍 Crawling **{url}** for keyword '**{keyword}**'...")
                crawler = Crawler(
                    url,
                    keyword,
                    max_pages=max_pages,
                    num_threads=threads,
                    enable_portscan=enable_scan,
                    ports_to_scan=ports
                )
                results = crawler.run()
                for r in results:
                    r["keyword"] = keyword
                    r["start_url"] = url
                all_results.extend(results)

    save_csv(all_results, filename)
    st.session_state.search_history.append({
        "timestamp": str(datetime.now()),
        "start_urls": urls,
        "keywords": keywords,
        "file": filename
    })

    if all_results:
        df = pd.DataFrame(all_results)
        st.success(f"{len(all_results)} results found.")
        st.dataframe(df)
        with open(filename, "rb") as f:
            st.download_button("📥 Download CSV", f, file_name=filename, mime="text/csv")
    else:
        st.warning("No matches found.")

# Sidebar search history (last five runs, newest first)
st.sidebar.header("📁 Search History")
for entry in reversed(st.session_state.search_history[-5:]):
    st.sidebar.write(f"🕓 {entry['timestamp']}")
    st.sidebar.write("Start URLs:", ", ".join(entry['start_urls']))
    st.sidebar.write("Keywords:", ", ".join(entry['keywords']))
    st.sidebar.write(f"📎 File: `{entry['file']}`")
    st.sidebar.markdown("---")

docker-compose.yml Normal file

@@ -0,0 +1,23 @@
version: '3.9'

services:
  webcrawler_gui:
    build: .
    container_name: webcrawler_gui
    ports:
      - "8501:8501"
    volumes:
      - webcrawler_data:/app/output
    restart: unless-stopped
    command: streamlit run crawler_gui.py --server.port=8501 --server.enableCORS=false

  webcrawler_cli:
    build: .
    container_name: webcrawler_cli
    entrypoint: ["python", "crawler_cli.py", "https://example.com", "AI,robots", "-n", "30", "-t", "6", "-o", "output/scan.csv"]
    volumes:
      - webcrawler_data:/app/output
    restart: "no"

volumes:
  webcrawler_data:

readme.txt Normal file

@@ -0,0 +1,87 @@
⚙️ Local Setup (without Docker)
webcrawler_app/
├── webcrawler.py        ← the crawler logic (translated to English)
├── crawler_gui.py       ← the Streamlit user interface
├── requirements.txt     ← all dependencies

Create a virtual environment, install the dependencies and start the GUI:
python -m venv venv
.\venv\Scripts\activate              # On Windows
pip install -r requirements.txt      # includes streamlit
python -m streamlit run crawler_gui.py

Optional: bundle the GUI into a standalone executable (pyinstaller is not listed in requirements.txt and must be installed separately):
pyinstaller --noconfirm --onefile --windowed crawler_gui.py
🗂️ Project Structure
webcrawler_project/
├── crawler_cli.py
├── crawler_gui.py
├── webcrawler.py
├── requirements.txt
├── Dockerfile
└── docker-compose.yml
- The webcrawler_gui service gives you a browser interface at http://localhost:8501
- The webcrawler_cli service runs the crawler once and then exits, which makes it well suited for automation
📦 How to Use It
- Build & run all services:
docker-compose up --build
- Want to run only the GUI?
docker-compose up webcrawler_gui
- Want to run only the CLI job?
docker-compose run --rm webcrawler_cli
- Stop services:
docker-compose down
📁 Saving Output Files
By default, output CSVs are written inside the container (into the named volume webcrawler_data). If you'd like to access them directly from your host system, replace the named volume on the webcrawler_cli service in docker-compose.yml with a bind mount:
volumes:
  - ./output:/app/output
And update crawler_cli.py so it saves its files under output/ (see the sketch below).
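A minimal sketch of that change in crawler_cli.py, assuming results should always land in an output/ folder next to the script (the folder name and the os.makedirs call are illustrative additions, not part of the current code):
import os

output_dir = "output"
os.makedirs(output_dir, exist_ok=True)   # create the folder if it does not exist yet
output = os.path.join(output_dir, args.output or f"crawl_{timestamp}.csv")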
🔧 Step 1: Prepare Your Project
Make sure your project folder contains:
crawler-app/
├── crawler_cli.py
├── crawler_gui.py
├── webcrawler.py
├── requirements.txt
├── Dockerfile
└── docker-compose.yml
If you followed the setup above, you're all set.
🚀 Step 2: Zip and Upload to Portainer
- Compress your project folder into a .zip file on your local machine.
- Log into Portainer.
- Go to Stacks → Add Stack.
- Give your stack a name (e.g. webcrawler).
- In the Web editor, paste the contents of your docker-compose.yml.
- Scroll down to “Advanced container settings” and upload the zipped project using the “Upload resources” option.
- Click Deploy the stack.
🖥️ Step 3: Access Your Services
- For the GUI, Portainer will expose port 8501 by default — access it via http://your-server-ip:8501.
- The CLI crawler will run on startup and exit. You can re-run it anytime from the Containers view, or with the command shown below.
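If you prefer a shell on the Docker host, the one-shot container can also be restarted by its container_name from docker-compose.yml:
docker start -a webcrawler_cli   # re-runs the original entrypoint and attaches to its output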
✅ Optional Enhancements
- Mount a named volume for persistent CSV output (already configured in the docker-compose.yml above):
volumes:
  - webcrawler_data:/app/output
- Add environment variables or schedules via Portainer's built-in options.

requirements.txt Normal file

@@ -0,0 +1,6 @@
requests
beautifulsoup4
urllib3
pandas
streamlit
dnspython

webcrawler.py Normal file

@@ -0,0 +1,121 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import socket
import threading
import queue
import csv
import urllib.robotparser


def scan_ports(ip, ports, timeout=1.0):
    """Return the subset of `ports` that accept a TCP connection on `ip`."""
    open_ports = []
    for port in ports:
        try:
            with socket.create_connection((ip, port), timeout=timeout):
                open_ports.append(port)
        except OSError:
            continue
    return open_ports


class Crawler:
    def __init__(self, start_url, keyword, max_pages=20, num_threads=5, enable_portscan=False, ports_to_scan=None):
        self.start_url = start_url
        self.keyword = keyword.lower()
        self.max_pages = max_pages
        self.num_threads = num_threads
        self.enable_portscan = enable_portscan
        self.ports_to_scan = ports_to_scan if ports_to_scan else []
        self.visited = set()
        self.to_visit = queue.Queue()
        self.to_visit.put(start_url)
        self.lock = threading.Lock()
        self.results = []
        self.rp_cache = {}

    def get_ip(self, url):
        try:
            return socket.gethostbyname(urlparse(url).hostname)
        except Exception:
            return "Unresolvable"

    def obey_robots(self, url):
        domain = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
        if domain in self.rp_cache:
            rp = self.rp_cache[domain]
        else:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urljoin(domain, "/robots.txt"))
            try:
                rp.read()
            except Exception:
                rp = None
            self.rp_cache[domain] = rp
        return rp is not None and rp.can_fetch("*", url)

    def crawl_page(self):
        while len(self.visited) < self.max_pages and not self.to_visit.empty():
            try:
                # The empty() check above can race with other threads,
                # so never block indefinitely waiting for a new URL.
                url = self.to_visit.get(timeout=1)
            except queue.Empty:
                break
            with self.lock:
                if url in self.visited:
                    continue
                self.visited.add(url)
            if not self.obey_robots(url):
                continue
            try:
                response = requests.get(url, timeout=6)
                soup = BeautifulSoup(response.text, "html.parser")
                text = soup.get_text().lower()
                fulltext = ' '.join(text.split())
            except Exception:
                continue
            if self.keyword in text:
                title = soup.title.string.strip() if soup.title and soup.title.string else "No Title"
                ip = self.get_ip(url)
                if self.enable_portscan and ip != "Unresolvable":
                    open_ports = scan_ports(ip, self.ports_to_scan)
                    ports_str = ", ".join(str(p) for p in open_ports) if open_ports else "None"
                else:
                    ports_str = "Not Scanned"
                with self.lock:
                    self.results.append({
                        "title": title,
                        "url": url,
                        "ip": ip,
                        "keyword": self.keyword,
                        "fulltext": fulltext,
                        "open_ports": ports_str
                    })
            # Queue every absolute link found on the page for later crawling.
            for tag in soup.find_all("a", href=True):
                next_url = urljoin(url, tag['href'])
                if next_url.startswith("http") and urlparse(next_url).netloc:
                    with self.lock:
                        if next_url not in self.visited:
                            self.to_visit.put(next_url)

    def run(self):
        threads = []
        for _ in range(self.num_threads):
            t = threading.Thread(target=self.crawl_page)
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        return self.results


def save_csv(results, filename="crawler_results.csv"):
    if not results:
        print("⚠️ Nothing to save.")
        return
    fieldnames = list(results[0].keys())
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            writer.writerow(row)
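
Because webcrawler.py exposes Crawler and save_csv directly, it can also be used as a small library; a minimal sketch (the URL, keyword and filename are illustrative):
from webcrawler import Crawler, save_csv

# Crawl one site for one keyword, no port scan.
crawler = Crawler("https://example.com", "privacy", max_pages=10, num_threads=3)
results = crawler.run()   # list of dicts: title, url, ip, keyword, fulltext, open_ports
save_csv(results, "example_results.csv")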