# website-crawler/webcrawler.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import socket
import threading
import queue
import time
import csv
import os

# Extensions treated as downloadable documents rather than crawlable pages.
DOCUMENT_EXTENSIONS = [".pdf", ".txt", ".json", ".doc", ".docx", ".csv",
                       ".xls", ".xlsx", ".ppt", ".pptx", ".html", ".htm"]
def scan_ports(ip, ports, timeout=1.0):
    """Return the subset of `ports` that accept a TCP connection on `ip`."""
    open_ports = []
    for port in ports:
        try:
            with socket.create_connection((ip, port), timeout=timeout):
                open_ports.append(port)
        except OSError:
            continue
    return open_ports
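
# Illustrative only: scanning common web ports on an already-resolved address
# (the IP and port list below are placeholders, not taken from the original code):
#   scan_ports("203.0.113.10", [80, 443, 8080])  # -> e.g. [80, 443]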

def extract_filename_from_url(url):
    return os.path.basename(urlparse(url).path)

def fetch_text_from_file(url, filetype, timeout=5):
    """Fetch a linked file; return a text snippet for plain-text types, else ""."""
    try:
        r = requests.get(url, timeout=timeout)
        if filetype in [".txt", ".json"]:
            return r.text[:5000].strip()
    except requests.RequestException:
        return ""
    return ""

class Crawler:
    def __init__(self, start_url, keyword, max_pages=20, num_threads=5,
                 enable_portscan=False, ports_to_scan=None):
        self.start_url = start_url
        self.keyword = keyword.lower()
        self.max_pages = max_pages
        self.num_threads = num_threads
        self.enable_portscan = enable_portscan
        self.ports_to_scan = ports_to_scan if ports_to_scan else []
        self.visited = set()
        self.to_visit = queue.Queue()
        self.to_visit.put(start_url)
        self.lock = threading.Lock()
        self.results = []
        self.active_thread_objects = []
        self.active_threads = 0

    def get_ip(self, url):
        try:
            return socket.gethostbyname(urlparse(url).hostname)
        except (socket.gaierror, TypeError):
            # TypeError covers URLs whose hostname parses as None.
            return "Unresolvable"

    def obey_robots(self, url):
        return True  # Ignoring robots.txt on purpose

    def crawl_page(self):
        with self.lock:
            self.active_threads += 1
        try:
            while len(self.visited) < self.max_pages:
                try:
                    # get_nowait() avoids the race between a separate empty()
                    # check and get(), which could block a thread forever.
                    url = self.to_visit.get_nowait()
                except queue.Empty:
                    break
                with self.lock:
                    if url in self.visited:
                        continue
                    self.visited.add(url)
                if not self.obey_robots(url):
                    continue
                try:
                    response = requests.get(url, timeout=6)
                    soup = BeautifulSoup(response.text, "html.parser")
                    text = soup.get_text().lower()
                    fulltext = " ".join(text.split())
                except requests.RequestException:
                    continue
                # Match against the whitespace-normalized text so keyword
                # phrases split across line breaks still hit.
                if self.keyword in fulltext:
                    title = (soup.title.string.strip()
                             if soup.title and soup.title.string else "No Title")
                    ip = self.get_ip(url)
                    ports_str = "Not Scanned"
                    if self.enable_portscan and ip != "Unresolvable":
                        open_ports = scan_ports(ip, self.ports_to_scan)
                        ports_str = ", ".join(str(p) for p in open_ports) if open_ports else "None"
                    preview = fulltext[:300] + "..." if len(fulltext) > 300 else fulltext
                    with self.lock:
                        self.results.append({
                            "title": title,
                            "url": url,
                            "ip": ip,
                            "keyword": self.keyword,
                            "fulltext": fulltext,
                            "open_ports": ports_str,
                            "file_url": "",
                            "file_name": "",
                            "file_type": "",
                            "matched_content": "",
                            "preview": preview
                        })
                for tag in soup.find_all("a", href=True):
                    href = tag["href"]
                    # Drop #fragments so the same page is not queued twice.
                    next_url, _ = urldefrag(urljoin(url, href))
                    if next_url.startswith("http") and urlparse(next_url).netloc:
                        ext = os.path.splitext(urlparse(next_url).path)[1].lower()
                        if ext in DOCUMENT_EXTENSIONS:
                            filename = extract_filename_from_url(next_url)
                            filetext = fetch_text_from_file(next_url, ext)
                            if self.keyword in filetext.lower():
                                preview = filetext[:300] + "..." if len(filetext) > 300 else filetext
                                with self.lock:
                                    self.results.append({
                                        "title": "Linked File",
                                        "url": url,
                                        "ip": self.get_ip(url),
                                        "keyword": self.keyword,
                                        "fulltext": "",
                                        "open_ports": "",
                                        "file_url": next_url,
                                        "file_name": filename,
                                        "file_type": ext.lstrip("."),
                                        "matched_content": filetext[:1000],
                                        "preview": preview
                                    })
                        else:
                            with self.lock:
                                if next_url not in self.visited:
                                    self.to_visit.put(next_url)
        finally:
            with self.lock:
                self.active_threads -= 1

    def run(self):
        self.active_thread_objects = []
        for _ in range(self.num_threads):
            t = threading.Thread(target=self.crawl_page)
            t.start()
            self.active_thread_objects.append(t)
        for t in self.active_thread_objects:
            t.join()
        return self.results

def save_csv(results, filename="crawler_results.csv"):
    if not results:
        print("⚠️ Nothing to save.")
        return
    fieldnames = list(results[0].keys())
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            writer.writerow(row)
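
# Minimal usage sketch (not part of the original file): the start URL,
# keyword, and port list below are placeholder values.
if __name__ == "__main__":
    crawler = Crawler(
        "https://example.com",
        "security",
        max_pages=10,
        num_threads=3,
        enable_portscan=False,
        ports_to_scan=[80, 443],
    )
    results = crawler.run()
    save_csv(results)
    print(f"Saved {len(results)} result(s).")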