Update crawler_gui.py
crawler_gui.py
@@ -1,53 +1,83 @@
 import streamlit as st
 import pandas as pd
-import time
-from webcrawler import Crawler, save_csv
 from datetime import datetime
+from webcrawler import Crawler, save_csv
+from urllib.parse import urlparse
+import time
+import altair as alt
 
-st.set_page_config(page_title="Multi-Site Web Crawler", layout="wide")
-st.title("🕷️ Web Crawler with Fulltext + Port Scanning")
-
-# User input fields
-start_urls_input = st.text_area("Start URLs (comma-separated)", "https://example.com, https://example.org")
-keywords_input = st.text_input("Keywords (comma-separated)", "AI, privacy")
-
-max_pages = st.slider("Maximum pages per site", 5, 1000, 200)
-threads = st.slider("Number of threads per crawl", 1, 20, 5)
-
-enable_scan = st.checkbox("🔌 Enable custom port scan")
-port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443,8080") if enable_scan else ""
-
-# Session history
+# Session init
 if "search_history" not in st.session_state:
     st.session_state.search_history = []
 
-if st.button("Start Crawling"):
+# Page setup
+st.set_page_config(page_title="Web Crawler Pro", layout="wide")
+st.title("🕷️ Web Crawler with File Search, Port Scan, Previews & Analytics")
+
+# Inputs
+start_urls_input = st.text_area("🌍 Start URLs (comma-separated)", "https://example.com, https://example.org")
+keywords_input = st.text_input("🔎 Keywords (comma-separated)", "AI, privacy")
+max_pages = st.slider("📄 Max pages per site", 5, 100, 20)
+threads = st.slider("🧵 Threads per crawl", 1, 20, 5)
+
+enable_scan = st.checkbox("🔌 Enable custom port scan")
+port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443") if enable_scan else ""
+
+show_only_files = st.checkbox("📄 Show only file matches", value=False)
+show_chart = st.checkbox("📊 Show keyword match chart", value=True)
+show_domains = st.checkbox("🌐 Show domain stats", value=True)
+
+start = st.button("🚀 Start Crawling")
+log_display = st.empty()
+thread_display = st.empty()
+progress_display = st.empty()
+
+if start:
     urls = [u.strip() for u in start_urls_input.split(",") if u.strip()]
     keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]
     ports = [int(p.strip()) for p in port_input.split(",") if p.strip().isdigit()] if enable_scan else []
 
     all_results = []
+    total_jobs = len(urls) * len(keywords)
+    job_index = 0
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     filename = f"crawl_{timestamp}.csv"
 
-    with st.spinner("Crawling… please wait"):
+    with st.spinner("Crawling in progress…"):
         for url in urls:
             for keyword in keywords:
-                st.write(f"🔍 Crawling **{url}** for keyword '**{keyword}**'...")
+                log_display.markdown(f"🔍 Crawling **{url}** for '**{keyword}**'...")
                 crawler = Crawler(
-                    url,
-                    keyword,
+                    start_url=url,
+                    keyword=keyword,
                     max_pages=max_pages,
                     num_threads=threads,
                     enable_portscan=enable_scan,
                     ports_to_scan=ports
                 )
 
+                thread_display.markdown(f"🧵 Threads active: `{threads}`")
+                progress_display.progress(0.0, text=f"Scanning {url}…")
+
                 results = crawler.run()
+                while any(t.is_alive() for t in crawler.active_thread_objects):
+                    time.sleep(0.2)
+                    ratio = min(len(crawler.visited) / max_pages, 1.0)
+                    progress_display.progress(ratio, text=f"{int(ratio*100)}% of max pages for {url}")
+                    thread_display.markdown(f"🧵 Threads active: `{crawler.active_threads}`")
+
+                thread_display.markdown("🧵 Threads active: `0`")
+                progress_display.progress(1.0, text="Done")
+
                 for r in results:
                     r["keyword"] = keyword
                     r["start_url"] = url
                 all_results.extend(results)
 
+                job_index += 1
+                progress_display.progress(job_index / total_jobs, text=f"Progress: {job_index}/{total_jobs}")
+
+    if all_results:
         save_csv(all_results, filename)
         st.session_state.search_history.append({
             "timestamp": str(datetime.now()),
@@ -56,20 +86,52 @@ if st.button("Start Crawling"):
             "file": filename
         })
 
-    if all_results:
         df = pd.DataFrame(all_results)
-        st.success(f"✅ {len(all_results)} results found.")
-        st.dataframe(df)
+        df["match_type"] = df["file_type"].apply(lambda x: "📄 File" if x else "🌐 Page")
+        df["domain"] = df["url"].apply(lambda x: urlparse(x).netloc)
+
+        if show_only_files:
+            df = df[df["file_url"] != ""]
+
+        st.success(f"✅ {len(df)} results found.")
+        st.dataframe(df[["match_type", "title", "keyword", "domain", "url", "file_url", "file_type", "preview"]])
+
         with open(filename, "rb") as f:
             st.download_button("📥 Download CSV", f, file_name=filename, mime="text/csv")
+
+        if show_chart and not df.empty:
+            freq_df = df["keyword"].value_counts().reset_index()
+            freq_df.columns = ["keyword", "matches"]
+            st.subheader("📊 Keyword Match Frequency")
+            chart = alt.Chart(freq_df).mark_bar().encode(
+                x=alt.X("keyword", sort="-y"),
+                y="matches",
+                tooltip=["keyword", "matches"]
+            ).properties(height=250)
+            st.altair_chart(chart, use_container_width=True)
+
+        if show_domains and not df.empty:
+            dom_df = df["domain"].value_counts().reset_index()
+            dom_df.columns = ["domain", "hits"]
+            st.subheader("🌐 Domains with Most Matches")
+            st.dataframe(dom_df)
+
     else:
         st.warning("No matches found.")
 
-# Sidebar search history
+# Sidebar: History
 st.sidebar.header("📁 Search History")
 for entry in reversed(st.session_state.search_history[-5:]):
     st.sidebar.write(f"🕓 {entry['timestamp']}")
     st.sidebar.write("Start URLs:", ", ".join(entry['start_urls']))
     st.sidebar.write("Keywords:", ", ".join(entry['keywords']))
     st.sidebar.write(f"📎 File: `{entry['file']}`")
     st.sidebar.markdown("---")
+
+# Footer
+st.markdown("""
+<hr>
+<div style='text-align: center; font-size: 0.9em;'>
+Built by <a href='https://github.com/JoranJix/website-crawler' target='_blank'>@JoranJix</a> · MIT Licensed
+</div>
+""", unsafe_allow_html=True)
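Reviewer note: the GUI above depends on a webcrawler module that is not touched by this commit. The stub below is a hypothetical sketch of the interface the new code assumes, inferred only from the calls visible in this diff (the keyword arguments passed to Crawler, crawler.run(), crawler.visited, crawler.active_threads, crawler.active_thread_objects, and save_csv); the real module in the repository may differ.

# Hypothetical stub of the interface crawler_gui.py assumes from webcrawler.
# All names here are inferred from the calls in this diff, not from the real module.
import csv
import threading
from typing import Dict, List, Optional


class Crawler:
    def __init__(self, start_url: str, keyword: str, max_pages: int = 20,
                 num_threads: int = 5, enable_portscan: bool = False,
                 ports_to_scan: Optional[List[int]] = None):
        self.start_url = start_url
        self.keyword = keyword
        self.max_pages = max_pages
        self.num_threads = num_threads
        self.enable_portscan = enable_portscan
        self.ports_to_scan = ports_to_scan or []
        self.visited = set()                  # URLs crawled so far (read by the progress bar)
        self.active_threads = 0               # live worker count (shown in the GUI)
        self.active_thread_objects: List[threading.Thread] = []  # polled with is_alive()

    def run(self) -> List[Dict[str, str]]:
        """Start the crawl and return one dict per match with at least the keys
        the GUI displays: url, title, preview, file_url, file_type."""
        raise NotImplementedError("replace with the real webcrawler implementation")


def save_csv(rows: List[Dict[str, str]], filename: str) -> None:
    """Write crawl results to a CSV file, one row per match."""
    if not rows:
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=sorted({k for r in rows for k in r}))
        writer.writeheader()
        writer.writerows(rows)

With a stub like this in place, the page can be rendered and the inputs exercised via streamlit run crawler_gui.py, though run() must be implemented before a crawl will actually complete.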