website-crawler/crawler_cli.py

from webcrawler import Crawler, save_csv
import argparse
from datetime import datetime
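
# Example invocation (illustrative URLs and keywords; all flags are defined below):
#
#   python crawler_cli.py "https://example.com,https://example.org" "login,admin" \
#       -n 50 -t 8 --portscan --ports 22,80,443 -o results.csv
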
def parse_comma_list(value):
    """Split a comma-separated string into a list of trimmed, non-empty items."""
    return [v.strip() for v in value.split(",") if v.strip()]
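
# Example: parse_comma_list(" a, b ,, c ") -> ["a", "b", "c"]
# (entries are trimmed; empty items are dropped).
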
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Headless Web Crawler with Fulltext + Portscan")
    parser.add_argument("urls", help="Comma-separated list of start URLs")
    parser.add_argument("keywords", help="Comma-separated list of keywords")
    parser.add_argument("-n", "--pages", type=int, default=20, help="Max pages per site")
    parser.add_argument("-t", "--threads", type=int, default=5, help="Number of threads")
    parser.add_argument("--portscan", action="store_true", help="Enable custom port scan")
    parser.add_argument("--ports", type=str, default="", help="Ports to scan (comma-separated)")
    parser.add_argument("-o", "--output", help="Output CSV filename (optional)")
    args = parser.parse_args()
    urls = parse_comma_list(args.urls)
    keywords = parse_comma_list(args.keywords)
    # Ports are only parsed when --portscan is set; otherwise pass an empty list.
    ports = [int(p) for p in parse_comma_list(args.ports)] if args.portscan else []

    all_results = []
    # The default output name carries a timestamp so repeated runs don't overwrite each other.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output = args.output or f"crawl_{timestamp}.csv"
    # Run one crawl per (start URL, keyword) pair and tag each result row
    # with the combination that produced it.
    for url in urls:
        for keyword in keywords:
            print(f"\n🌐 Crawling '{url}' for keyword: '{keyword}'")
            crawler = Crawler(
                start_url=url,
                keyword=keyword,
                max_pages=args.pages,
                num_threads=args.threads,
                enable_portscan=args.portscan,
                ports_to_scan=ports,
            )
            results = crawler.run()
            for r in results:
                r["keyword"] = keyword
                r["start_url"] = url
            all_results.extend(results)
    if all_results:
        save_csv(all_results, output)
        print(f"\n✅ Done: {len(all_results)} results saved to '{output}'")
    else:
        print("❌ No matches found.")