Update crawler_gui.py
crawler_gui.py
@@ -1,53 +1,83 @@
 import streamlit as st
 import pandas as pd
-import time
-from webcrawler import Crawler, save_csv
 from datetime import datetime
+from webcrawler import Crawler, save_csv
+from urllib.parse import urlparse
+import time
+import altair as alt
 
-st.set_page_config(page_title="Multi-Site Web Crawler", layout="wide")
-st.title("🕷️ Web Crawler with Fulltext + Port Scanning")
-
-# User input fields
-start_urls_input = st.text_area("Start URLs (comma-separated)", "https://example.com, https://example.org")
-keywords_input = st.text_input("Keywords (comma-separated)", "AI, privacy")
-
-max_pages = st.slider("Maximum pages per site", 5, 1000, 200)
-threads = st.slider("Number of threads per crawl", 1, 20, 5)
-
-enable_scan = st.checkbox("🔌 Enable custom port scan")
-port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443,8080") if enable_scan else ""
-
-# Session history
+# Session init
 if "search_history" not in st.session_state:
     st.session_state.search_history = []
 
-if st.button("Start Crawling"):
+# Page setup
+st.set_page_config(page_title="Web Crawler Pro", layout="wide")
+st.title("🕷️ Web Crawler with File Search, Port Scan, Previews & Analytics")
+
+# Inputs
+start_urls_input = st.text_area("🌍 Start URLs (comma-separated)", "https://example.com, https://example.org")
+keywords_input = st.text_input("🔎 Keywords (comma-separated)", "AI, privacy")
+max_pages = st.slider("📄 Max pages per site", 5, 100, 20)
+threads = st.slider("🧵 Threads per crawl", 1, 20, 5)
+
+enable_scan = st.checkbox("🔌 Enable custom port scan")
+port_input = st.text_input("Ports to scan (comma-separated)", "22,80,443") if enable_scan else ""
+
+show_only_files = st.checkbox("📄 Show only file matches", value=False)
+show_chart = st.checkbox("📊 Show keyword match chart", value=True)
+show_domains = st.checkbox("🌐 Show domain stats", value=True)
+
+start = st.button("🚀 Start Crawling")
+log_display = st.empty()
+thread_display = st.empty()
+progress_display = st.empty()
+
+if start:
     urls = [u.strip() for u in start_urls_input.split(",") if u.strip()]
     keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]
     ports = [int(p.strip()) for p in port_input.split(",") if p.strip().isdigit()] if enable_scan else []
 
     all_results = []
+    total_jobs = len(urls) * len(keywords)
+    job_index = 0
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     filename = f"crawl_{timestamp}.csv"
 
-    with st.spinner("Crawling… please wait"):
+    with st.spinner("Crawling in progress…"):
         for url in urls:
             for keyword in keywords:
-                st.write(f"🔍 Crawling **{url}** for keyword '**{keyword}**'...")
+                log_display.markdown(f"🔍 Crawling **{url}** for '**{keyword}**'...")
                 crawler = Crawler(
-                    url,
-                    keyword,
+                    start_url=url,
+                    keyword=keyword,
                     max_pages=max_pages,
                     num_threads=threads,
                     enable_portscan=enable_scan,
                     ports_to_scan=ports
                 )
 
+                thread_display.markdown(f"🧵 Threads active: `{threads}`")
+                progress_display.progress(0.0, text=f"Scanning {url}…")
+
                 results = crawler.run()
+                while any(t.is_alive() for t in crawler.active_thread_objects):
+                    time.sleep(0.2)
+                    ratio = min(len(crawler.visited) / max_pages, 1.0)
+                    progress_display.progress(ratio, text=f"{int(ratio*100)}% of max pages for {url}")
+                    thread_display.markdown(f"🧵 Threads active: `{crawler.active_threads}`")
+
+                thread_display.markdown("🧵 Threads active: `0`")
+                progress_display.progress(1.0, text="Done")
+
                 for r in results:
                     r["keyword"] = keyword
                     r["start_url"] = url
                 all_results.extend(results)
 
+                job_index += 1
+                progress_display.progress(job_index / total_jobs, text=f"Progress: {job_index}/{total_jobs}")
+
+    if all_results:
         save_csv(all_results, filename)
         st.session_state.search_history.append({
             "timestamp": str(datetime.now()),
@@ -56,20 +86,52 @@ if st.button("Start Crawling"):
             "file": filename
         })
 
-    if all_results:
         df = pd.DataFrame(all_results)
-        st.success(f"✅ {len(all_results)} results found.")
-        st.dataframe(df)
+        df["match_type"] = df["file_type"].apply(lambda x: "📄 File" if x else "🌐 Page")
+        df["domain"] = df["url"].apply(lambda x: urlparse(x).netloc)
+
+        if show_only_files:
+            df = df[df["file_url"] != ""]
+
+        st.success(f"✅ {len(df)} results found.")
+        st.dataframe(df[["match_type", "title", "keyword", "domain", "url", "file_url", "file_type", "preview"]])
+
         with open(filename, "rb") as f:
             st.download_button("📥 Download CSV", f, file_name=filename, mime="text/csv")
+
+        if show_chart and not df.empty:
+            freq_df = df["keyword"].value_counts().reset_index()
+            freq_df.columns = ["keyword", "matches"]
+            st.subheader("📊 Keyword Match Frequency")
+            chart = alt.Chart(freq_df).mark_bar().encode(
+                x=alt.X("keyword", sort="-y"),
+                y="matches",
+                tooltip=["keyword", "matches"]
+            ).properties(height=250)
+            st.altair_chart(chart, use_container_width=True)
+
+        if show_domains and not df.empty:
+            dom_df = df["domain"].value_counts().reset_index()
+            dom_df.columns = ["domain", "hits"]
+            st.subheader("🌐 Domains with Most Matches")
+            st.dataframe(dom_df)
+
     else:
         st.warning("No matches found.")
 
-# Sidebar search history
+# Sidebar: History
 st.sidebar.header("📁 Search History")
 for entry in reversed(st.session_state.search_history[-5:]):
     st.sidebar.write(f"🕓 {entry['timestamp']}")
     st.sidebar.write("Start URLs:", ", ".join(entry['start_urls']))
     st.sidebar.write("Keywords:", ", ".join(entry['keywords']))
     st.sidebar.write(f"📎 File: `{entry['file']}`")
     st.sidebar.markdown("---")
+
+# Footer
+st.markdown("""
+<hr>
+<div style='text-align: center; font-size: 0.9em;'>
+Built by <a href='https://github.com/JoranJix/website-crawler' target='_blank'>@JoranJix</a> · MIT Licensed
+</div>
+""", unsafe_allow_html=True)
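Reviewer note: the GUI above depends on a webcrawler module that is not touched by this commit. The stub below is a hypothetical sketch of the interface the new code assumes, inferred only from the calls visible in this diff (the keyword arguments passed to Crawler, crawler.run(), crawler.visited, crawler.active_threads, crawler.active_thread_objects, and save_csv); the real module in the repository may differ.

# Hypothetical stub of the interface crawler_gui.py assumes from webcrawler.
# All names here are inferred from the calls in this diff, not from the real module.
import csv
import threading
from typing import Dict, List, Optional


class Crawler:
    def __init__(self, start_url: str, keyword: str, max_pages: int = 20,
                 num_threads: int = 5, enable_portscan: bool = False,
                 ports_to_scan: Optional[List[int]] = None):
        self.start_url = start_url
        self.keyword = keyword
        self.max_pages = max_pages
        self.num_threads = num_threads
        self.enable_portscan = enable_portscan
        self.ports_to_scan = ports_to_scan or []
        self.visited = set()                  # URLs crawled so far (read by the progress bar)
        self.active_threads = 0               # live worker count (shown in the GUI)
        self.active_thread_objects: List[threading.Thread] = []  # polled with is_alive()

    def run(self) -> List[Dict[str, str]]:
        """Start the crawl and return one dict per match with at least the keys
        the GUI displays: url, title, preview, file_url, file_type."""
        raise NotImplementedError("replace with the real webcrawler implementation")


def save_csv(rows: List[Dict[str, str]], filename: str) -> None:
    """Write crawl results to a CSV file, one row per match."""
    if not rows:
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=sorted({k for r in rows for k in r}))
        writer.writeheader()
        writer.writerows(rows)

With a stub like this in place, the page can be rendered and the inputs exercised via streamlit run crawler_gui.py, though run() must be implemented before a crawl will actually complete.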