Add files via upload
Dockerfile (new file)
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

CMD ["streamlit", "run", "crawler_gui.py", "--server.port=8501", "--server.enableCORS=false"]
crawler_cli.py (new file)
from webcrawler import Crawler, save_csv
import argparse
from datetime import datetime


def parse_comma_list(value):
    """Split a comma-separated string into a list of trimmed, non-empty items."""
    return [v.strip() for v in value.split(",") if v.strip()]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Headless Web Crawler with Fulltext + Portscan")
    parser.add_argument("urls", help="Comma-separated list of start URLs")
    parser.add_argument("keywords", help="Comma-separated list of keywords")
    parser.add_argument("-n", "--pages", type=int, default=20, help="Max pages per site")
    parser.add_argument("-t", "--threads", type=int, default=5, help="Number of threads")
    parser.add_argument("--portscan", action="store_true", help="Enable custom port scan")
    parser.add_argument("--ports", type=str, default="", help="Ports to scan (comma-separated)")
    parser.add_argument("-o", "--output", help="Output CSV filename (optional)")

    args = parser.parse_args()

    urls = parse_comma_list(args.urls)
    keywords = parse_comma_list(args.keywords)
    ports = [int(p) for p in parse_comma_list(args.ports)] if args.portscan else []

    all_results = []
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output = args.output or f"crawl_{timestamp}.csv"

    # Crawl every start URL once per keyword and collect all matches.
    for url in urls:
        for keyword in keywords:
            print(f"\n🌐 Crawling '{url}' for keyword: '{keyword}'")
            crawler = Crawler(
                start_url=url,
                keyword=keyword,
                max_pages=args.pages,
                num_threads=args.threads,
                enable_portscan=args.portscan,
                ports_to_scan=ports
            )
            results = crawler.run()
            for r in results:
                r["keyword"] = keyword
                r["start_url"] = url
            all_results.extend(results)

    if all_results:
        save_csv(all_results, output)
        print(f"\n✅ Done: {len(all_results)} results saved to '{output}'")
    else:
        print("❌ No matches found.")
crawler_gui.py (new file)
import streamlit as st
import pandas as pd
from webcrawler import Crawler, save_csv
from datetime import datetime

st.set_page_config(page_title="Multi-Site Web Crawler", layout="wide")
st.title("🕷️ Web Crawler with Fulltext + Port Scanning")

# User input fields
start_urls_input = st.text_area("Start URLs (comma-separated)", "https://example.com, https://example.org")
keywords_input = st.text_input("Keywords (comma-separated)", "AI, privacy")

max_pages = st.slider("Maximum pages per site", 5, 1000, 200)
threads = st.slider("Number of threads per crawl", 1, 20, 5)

enable_scan = st.checkbox("🔌 Enable custom port scan")
port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443,8080") if enable_scan else ""

# Session history
if "search_history" not in st.session_state:
    st.session_state.search_history = []

if st.button("Start Crawling"):
    urls = [u.strip() for u in start_urls_input.split(",") if u.strip()]
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]
    ports = [int(p.strip()) for p in port_input.split(",") if p.strip().isdigit()] if enable_scan else []

    all_results = []
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"crawl_{timestamp}.csv"

    with st.spinner("Crawling… please wait"):
        # Run one crawl per (start URL, keyword) combination.
        for url in urls:
            for keyword in keywords:
                st.write(f"🔍 Crawling **{url}** for keyword '**{keyword}**'...")
                crawler = Crawler(
                    url,
                    keyword,
                    max_pages=max_pages,
                    num_threads=threads,
                    enable_portscan=enable_scan,
                    ports_to_scan=ports
                )
                results = crawler.run()
                for r in results:
                    r["keyword"] = keyword
                    r["start_url"] = url
                all_results.extend(results)

    save_csv(all_results, filename)
    st.session_state.search_history.append({
        "timestamp": str(datetime.now()),
        "start_urls": urls,
        "keywords": keywords,
        "file": filename
    })

    if all_results:
        df = pd.DataFrame(all_results)
        st.success(f"✅ {len(all_results)} results found.")
        st.dataframe(df)
        with open(filename, "rb") as f:
            st.download_button("📥 Download CSV", f, file_name=filename, mime="text/csv")
    else:
        st.warning("No matches found.")

# Sidebar search history
st.sidebar.header("📁 Search History")
for entry in reversed(st.session_state.search_history[-5:]):
    st.sidebar.write(f"🕓 {entry['timestamp']}")
    st.sidebar.write("Start URLs:", ", ".join(entry['start_urls']))
    st.sidebar.write("Keywords:", ", ".join(entry['keywords']))
    st.sidebar.write(f"📎 File: `{entry['file']}`")
    st.sidebar.markdown("---")
docker-compose.yml (new file)
version: '3.9'

services:
  webcrawler_gui:
    build: .
    container_name: webcrawler_gui
    ports:
      - "8501:8501"
    volumes:
      - webcrawler_data:/app/output
    restart: unless-stopped
    command: streamlit run crawler_gui.py --server.port=8501 --server.enableCORS=false

  webcrawler_cli:
    build: .
    container_name: webcrawler_cli
    entrypoint: ["python", "crawler_cli.py", "https://example.com", "AI,robots", "-n", "30", "-t", "6", "-o", "output/scan.csv"]
    volumes:
      - webcrawler_data:/app/output
    restart: "no"

volumes:
  webcrawler_data:
readme.txt (new file)
🛠️ Local Setup (without Docker)

webcrawler_app/
├── webcrawler.py       ← the crawler logic (translated to English)
├── crawler_gui.py      ← the Streamlit user interface
├── requirements.txt    ← with all your dependencies

Create and activate a virtual environment:

python -m venv venv
.\venv\Scripts\activate         # On Windows
source venv/bin/activate        # On Linux/macOS

Install the dependencies:

pip install -r requirements.txt
python -m pip install streamlit     # optional; streamlit is already in requirements.txt

Run the GUI:

python -m streamlit run crawler_gui.py

Optionally, build a standalone executable (requires PyInstaller, installed separately with pip install pyinstaller):

pyinstaller --noconfirm --onefile --windowed crawler_gui.py
🗂️ Project Structure

webcrawler_project/
├── crawler_cli.py
├── crawler_gui.py
├── webcrawler.py
├── requirements.txt
├── Dockerfile
└── docker-compose.yml
- The webcrawler_gui service gives you a browser interface at http://localhost:8501
- The webcrawler_cli service runs the crawler once and exits, which makes it ideal for automation (a programmatic alternative is sketched after the command list below)

📦 How to Use It

- Build & run all services:
  docker-compose up --build
- Run only the GUI:
  docker-compose up webcrawler_gui
- Run only the CLI job:
  docker-compose run --rm webcrawler_cli
- Stop services:
  docker-compose down
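Because the crawler is an ordinary Python module, automation can also bypass the CLI entrypoint and import it directly. A minimal sketch using the Crawler and save_csv API from webcrawler.py (the URL, keyword, and filename below are placeholders):

from webcrawler import Crawler, save_csv

# One programmatic crawl, equivalent to a single CLI run.
crawler = Crawler(
    start_url="https://example.com",
    keyword="AI",
    max_pages=30,
    num_threads=6,
    enable_portscan=False,
)
results = crawler.run()
save_csv(results, "scan_results.csv")   # prints a warning and writes nothing if there were no matches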
📁 Saving Output Files

By default, output CSVs are saved inside the container. If you'd like to access them from your host system, update the webcrawler_cli service in docker-compose.yml to mount a host directory instead:

volumes:
  - ./output:/app/output

Then update crawler_cli.py so it saves its files under output/, e.g. output/filename.csv (see the sketch below).
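A minimal sketch of that crawler_cli.py change (the output/ directory name is an assumption chosen to match the /app/output mount; the timestamped default filename mirrors the existing CLI logic):

import os
from datetime import datetime

# Build the default CSV path inside an output/ folder so the file lands on
# the volume mounted at /app/output.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)      # create the folder on first run

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
default_output = os.path.join(output_dir, f"crawl_{timestamp}.csv")
print(default_output)                       # e.g. output/crawl_20240101_120000.csv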
🔧 Step 1: Prepare Your Project

Make sure your project folder contains:

crawler-app/
├── crawler_cli.py
├── crawler_gui.py
├── webcrawler.py
├── requirements.txt
├── Dockerfile
└── docker-compose.yml

If you followed the setup above, you're all set.
🚀 Step 2: Zip and Upload to Portainer

- Compress your project folder into a .zip file on your local machine.
- Log into Portainer.
- Go to Stacks → Add Stack.
- Give your stack a name (e.g. webcrawler).
- In the web editor, paste the contents of your docker-compose.yml.
- Scroll down to “Advanced container settings” and upload the zipped project using the “Upload resources” option.
- Click “Deploy the stack”.

🖥️ Step 3: Access Your Services

- The GUI is published on port 8501 as configured in docker-compose.yml; access it via http://your-server-ip:8501.
- The CLI crawler runs once on startup and exits. You can re-run it anytime from the Containers view.
✅ Optional Enhancements

- Mount a named volume for persistent CSV output (docker-compose.yml already does this):

  volumes:
    - webcrawler_data:/app/output

- Add environment variables or schedules via Portainer’s built-in options (one possible env-variable pattern is sketched below).
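If you go the environment-variable route, one possible pattern (the variable names CRAWL_URLS, CRAWL_KEYWORDS, and CRAWL_MAX_PAGES are hypothetical, not something the current crawler_cli.py reads) is to let the script fall back to env values:

import os

# Hypothetical pattern for Portainer-provided settings; adapt crawler_cli.py
# to read these before falling back to its argparse defaults.
urls = os.environ.get("CRAWL_URLS", "https://example.com").split(",")
keywords = os.environ.get("CRAWL_KEYWORDS", "AI").split(",")
max_pages = int(os.environ.get("CRAWL_MAX_PAGES", "30"))

print(urls, keywords, max_pages)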
requirements.txt (new file)
requests
beautifulsoup4
urllib3
pandas
streamlit
dnspython
webcrawler.py (new file)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import socket
import threading
import queue
import csv
import urllib.robotparser


def scan_ports(ip, ports, timeout=1.0):
    """Return the subset of `ports` that accept a TCP connection on `ip`."""
    open_ports = []
    for port in ports:
        try:
            with socket.create_connection((ip, port), timeout=timeout):
                open_ports.append(port)
        except OSError:
            continue
    return open_ports


class Crawler:
    def __init__(self, start_url, keyword, max_pages=20, num_threads=5, enable_portscan=False, ports_to_scan=None):
        self.start_url = start_url
        self.keyword = keyword.lower()
        self.max_pages = max_pages
        self.num_threads = num_threads
        self.enable_portscan = enable_portscan
        self.ports_to_scan = ports_to_scan if ports_to_scan else []
        self.visited = set()
        self.to_visit = queue.Queue()
        self.to_visit.put(start_url)
        self.lock = threading.Lock()
        self.results = []
        self.rp_cache = {}  # robots.txt parser cached per domain

    def get_ip(self, url):
        try:
            return socket.gethostbyname(urlparse(url).hostname)
        except (OSError, TypeError):
            return "Unresolvable"

    def obey_robots(self, url):
        """Check robots.txt (cached per domain); an unreadable robots.txt skips the URL."""
        domain = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
        if domain in self.rp_cache:
            rp = self.rp_cache[domain]
        else:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urljoin(domain, "/robots.txt"))
            try:
                rp.read()
            except Exception:
                rp = None
            self.rp_cache[domain] = rp
        return rp is not None and rp.can_fetch("*", url)

    def crawl_page(self):
        while len(self.visited) < self.max_pages and not self.to_visit.empty():
            try:
                # Non-blocking get: another worker may have drained the queue
                # between the empty() check and this call.
                url = self.to_visit.get_nowait()
            except queue.Empty:
                continue

            with self.lock:
                if url in self.visited:
                    continue
                self.visited.add(url)

            if not self.obey_robots(url):
                continue

            try:
                response = requests.get(url, timeout=6)
                soup = BeautifulSoup(response.text, "html.parser")
                text = soup.get_text().lower()
                fulltext = ' '.join(text.split())
            except Exception:
                continue

            if self.keyword in text:
                title = soup.title.string.strip() if soup.title and soup.title.string else "No Title"
                ip = self.get_ip(url)
                if self.enable_portscan and ip != "Unresolvable":
                    open_ports = scan_ports(ip, self.ports_to_scan)
                    ports_str = ", ".join(str(p) for p in open_ports) if open_ports else "None"
                else:
                    ports_str = "Not Scanned"

                with self.lock:
                    self.results.append({
                        "title": title,
                        "url": url,
                        "ip": ip,
                        "keyword": self.keyword,
                        "fulltext": fulltext,
                        "open_ports": ports_str
                    })

            # Queue outgoing links regardless of whether the keyword matched.
            for tag in soup.find_all("a", href=True):
                next_url = urljoin(url, tag['href'])
                if next_url.startswith("http") and urlparse(next_url).netloc:
                    with self.lock:
                        if next_url not in self.visited:
                            self.to_visit.put(next_url)

    def run(self):
        threads = []
        for _ in range(self.num_threads):
            t = threading.Thread(target=self.crawl_page)
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        return self.results


def save_csv(results, filename="crawler_results.csv"):
    if not results:
        print("⚠️ Nothing to save.")
        return
    fieldnames = list(results[0].keys())
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            writer.writerow(row)