Update crawler_gui.py

This commit is contained in:
DerGamerPanda
2025-06-26 12:52:05 +02:00
committed by GitHub
parent d00fca79ed
commit c63ab23332

import streamlit as st
import pandas as pd
from webcrawler import Crawler, save_csv
from datetime import datetime
from urllib.parse import urlparse
import time
import altair as alt
# Session init
if "search_history" not in st.session_state:
    st.session_state.search_history = []
# Page setup
st.set_page_config(page_title="Web Crawler Pro", layout="wide")
st.title("🕷️ Web Crawler with File Search, Port Scan, Previews & Analytics")
# Inputs
start_urls_input = st.text_area("🌍 Start URLs (comma-separated)", "https://example.com, https://example.org")
keywords_input = st.text_input("🔎 Keywords (comma-separated)", "AI, privacy")
max_pages = st.slider("📄 Max pages per site", 5, 100, 20)
threads = st.slider("🧵 Threads per crawl", 1, 20, 5)
enable_scan = st.checkbox("🔌 Enable custom port scan")
port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443") if enable_scan else ""
show_only_files = st.checkbox("📄 Show only file matches", value=False)
show_chart = st.checkbox("📊 Show keyword match chart", value=True)
show_domains = st.checkbox("🌐 Show domain stats", value=True)
start = st.button("🚀 Start Crawling")
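# Placeholders that are updated live while the crawl runs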
log_display = st.empty()
thread_display = st.empty()
progress_display = st.empty()
if start:
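    # Parse the comma-separated inputs into clean lists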
    urls = [u.strip() for u in start_urls_input.split(",") if u.strip()]
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]
    ports = [int(p.strip()) for p in port_input.split(",") if p.strip().isdigit()] if enable_scan else []
    all_results = []
    total_jobs = len(urls) * len(keywords)
    job_index = 0
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"crawl_{timestamp}.csv"
with st.spinner("Crawling… please wait"):
with st.spinner("Crawling in progress…"):
for url in urls:
for keyword in keywords:
st.write(f"🔍 Crawling **{url}** for keyword '**{keyword}**'...")
log_display.markdown(f"🔍 Crawling **{url}** for '**{keyword}**'...")
crawler = Crawler(
url,
keyword,
start_url=url,
keyword=keyword,
max_pages=max_pages,
num_threads=threads,
enable_portscan=enable_scan,
ports_to_scan=ports
)
thread_display.markdown(f"🧵 Threads active: `{threads}`")
progress_display.progress(0.0, text=f"Scanning {url}")
results = crawler.run()
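                # Assumes the Crawler class exposes active_thread_objects,
                # visited, and active_threads for progress reporting
                # (defined in webcrawler.py, not shown in this commit)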
                while any(t.is_alive() for t in crawler.active_thread_objects):
                    time.sleep(0.2)
                    ratio = min(len(crawler.visited) / max_pages, 1.0)
                    progress_display.progress(ratio, text=f"{int(ratio*100)}% of max pages for {url}")
                    thread_display.markdown(f"🧵 Threads active: `{crawler.active_threads}`")
                thread_display.markdown("🧵 Threads active: `0`")
                progress_display.progress(1.0, text="Done")
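                # Tag each result with the keyword and start URL that produced it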
                for r in results:
                    r["keyword"] = keyword
                    r["start_url"] = url
                all_results.extend(results)
                job_index += 1
                progress_display.progress(job_index / total_jobs, text=f"Progress: {job_index}/{total_jobs}")
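    # Persist the run, record it in the session history, and render the results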
    if all_results:
        save_csv(all_results, filename)
        st.session_state.search_history.append({
            "timestamp": str(datetime.now()),
            "keywords": keywords,
            "file": filename
        })
        df = pd.DataFrame(all_results)
        df["match_type"] = df["file_type"].apply(lambda x: "📄 File" if x else "🌐 Page")
        df["domain"] = df["url"].apply(lambda x: urlparse(x).netloc)
        if show_only_files:
            df = df[df["file_url"] != ""]
        st.success(f"{len(df)} results found.")
        st.dataframe(df[["match_type", "title", "keyword", "domain", "url", "file_url", "file_type", "preview"]])
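        # Offer the raw CSV for download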
        with open(filename, "rb") as f:
            st.download_button("📥 Download CSV", f, file_name=filename, mime="text/csv")
        if show_chart and not df.empty:
            freq_df = df["keyword"].value_counts().reset_index()
            freq_df.columns = ["keyword", "matches"]
            st.subheader("📊 Keyword Match Frequency")
            chart = alt.Chart(freq_df).mark_bar().encode(
                x=alt.X("keyword", sort="-y"),
                y="matches",
                tooltip=["keyword", "matches"]
            ).properties(height=250)
            st.altair_chart(chart, use_container_width=True)
        if show_domains and not df.empty:
            dom_df = df["domain"].value_counts().reset_index()
            dom_df.columns = ["domain", "hits"]
            st.subheader("🌐 Domains with Most Matches")
            st.dataframe(dom_df)
    else:
        st.warning("No matches found.")
# Sidebar: History
st.sidebar.header("📁 Search History")
for entry in reversed(st.session_state.search_history[-5:]):
st.sidebar.write(f"🕓 {entry['timestamp']}")
@@ -73,3 +127,11 @@ for entry in reversed(st.session_state.search_history[-5:]):
st.sidebar.write("Keywords:", ", ".join(entry['keywords']))
st.sidebar.write(f"📎 File: `{entry['file']}`")
st.sidebar.markdown("---")
# Footer
st.markdown("""
<hr>
<div style='text-align: center; font-size: 0.9em;'>
Built by <a href='https://github.com/JoranJix/website-crawler' target='_blank'>@JoranJix</a> · MIT Licensed
</div>
""", unsafe_allow_html=True)