first commit
This commit is contained in:
commit
4767d7b95d
7 changed files with 268 additions and 0 deletions
13
.gitignore
vendored
Normal file
13
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# Python-generated files
|
||||
__pycache__/
|
||||
*.py[oc]
|
||||
build/
|
||||
dist/
|
||||
wheels/
|
||||
*.egg-info
|
||||
|
||||
# Virtual environments
|
||||
.venv
|
||||
.python-version
|
||||
uv.lock
|
||||
webscraping.db
|
||||
0
README.md
Normal file
0
README.md
Normal file
9
pyproject.toml
Normal file
9
pyproject.toml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
[project]
|
||||
name = "webscraping"
|
||||
version = "0.1.0"
|
||||
description = "Web Scraping App with Forensic Features"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"pyside6>=6.9.2",
|
||||
]
|
||||
1
src/__init__.py
Normal file
1
src/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# This file makes the src directory a package.
|
||||
138
src/browser_window.py
Normal file
138
src/browser_window.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
# src/browser_window.py
|
||||
import io
import shutil
import tempfile
from pathlib import Path

from PySide6.QtCore import QBuffer, QByteArray, QIODevice
from PySide6.QtCore import QUrl, Qt
from PySide6.QtGui import QAction
from PySide6.QtWebEngineCore import QWebEngineContextMenuRequest
from PySide6.QtWebEngineCore import QWebEngineDownloadRequest
from PySide6.QtWebEngineWidgets import QWebEngineView
from PySide6.QtWidgets import (
    QMainWindow, QToolBar, QMenu, QMessageBox, QFileDialog, QInputDialog
)

from .db import init_db, save_page, save_screenshot, save_tag, save_event
|
||||
|
||||
class BrowserWindow(QMainWindow):
    """Browser window that archives what the user looks at.

    Every successfully loaded page is stored (URL + HTML) through the ``db``
    module. The toolbar can capture a screenshot of the view, and right-clicking
    an image offers to store that image and attach a free-text tag to it.
    """

    def __init__(self):
        super().__init__()
        self.setWindowTitle("Law‑Enforcement Web Scraper")
        self.resize(1200, 800)

        # Initialise DB and state before any signal is connected, so a slot
        # can never run against a half-built window.
        init_db()
        self.current_page_id = None
        self.pending_url = ""
        # Image URLs whose download was started by tag_image(), mapped to the
        # page id that was current when the user asked for the tag.
        self._pending_images: dict[str, int] = {}

        # ---- Web view ------------------------------------------------
        self.view = QWebEngineView()
        self.setCentralWidget(self.view)

        # ---- Toolbar -------------------------------------------------
        toolbar = QToolBar()
        self.addToolBar(toolbar)
        for label, slot in (
            ("←", self.view.back),
            ("→", self.view.forward),
            ("⟳", self.view.reload),
            ("📸 Capture", self.capture_screenshot),
        ):
            action = QAction(label, self)
            action.triggered.connect(slot)
            toolbar.addAction(action)

        # ---- Signals -------------------------------------------------
        self.view.urlChanged.connect(self.on_url_changed)
        self.view.loadFinished.connect(self.on_load_finished)
        self.view.page().profile().downloadRequested.connect(self.on_download_requested)

        # Context-menu handling for image tagging
        self.view.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
        self.view.customContextMenuRequested.connect(self.show_context_menu)

    # ------------------------------------------------------------------
    def on_url_changed(self, url: QUrl):
        """Remember the URL; HTML will be saved once the page finishes loading."""
        self.pending_url = url.toString()

    def on_load_finished(self, ok: bool):
        """Persist the page HTML after a successful load, or warn on failure."""
        if not ok:
            QMessageBox.warning(self, "Load error", f"Failed to load {self.pending_url}")
            return
        # toHtml() is asynchronous: snapshot the URL now so the HTML cannot be
        # stored under a URL the user navigated to in the meantime.
        loaded_url = self.pending_url
        self.view.page().toHtml(lambda html: self._store_page(html, loaded_url))

    def _store_page(self, html: str, url: str | None = None):
        """Save a page record and keep its id for later screenshots/events.

        ``url`` defaults to ``self.pending_url`` for callers that pass only
        the HTML.
        """
        page_url = self.pending_url if url is None else url
        self.current_page_id = save_page(page_url, html)

    # ------------------------------------------------------------------
    def capture_screenshot(self):
        """Grab the web view as PNG, store it, then ask the user for a tag."""
        page_id = self.current_page_id
        if page_id is None:
            QMessageBox.information(self, "Info", "No page loaded yet.")
            return

        # QWidget.grab() returns the QPixmap directly (it is not a future).
        pixmap = self.view.grab()

        # QPixmap.save() writes to a QIODevice, not a Python file object, so
        # encode into a QBuffer backed by a QByteArray.
        raw = QByteArray()
        buffer = QBuffer(raw)
        buffer.open(QIODevice.OpenModeFlag.WriteOnly)
        encoded = pixmap.save(buffer, "PNG")
        buffer.close()
        if not encoded:
            QMessageBox.warning(self, "Capture error", "Could not encode the screenshot as PNG.")
            return

        screenshot_id = save_screenshot(page_id, raw.data())
        self.prompt_tag(screenshot_id)

    # ------------------------------------------------------------------
    def show_context_menu(self, pos):
        """Offer "Add tag to image" when the right-clicked element is an image."""
        # Qt 6 exposes the request on the view; page().contextMenuData() was
        # the Qt 5 API.
        ctx = self.view.lastContextMenuRequest()
        if ctx is None:
            return
        if ctx.mediaType() != QWebEngineContextMenuRequest.MediaType.MediaTypeImage:
            return

        menu = QMenu(self)
        # Parent the action to the menu so both are released together.
        tag_act = QAction("Add tag to image", menu)
        tag_act.triggered.connect(lambda: self.tag_image(ctx))
        menu.addAction(tag_act)
        menu.exec(self.view.mapToGlobal(pos))
        menu.deleteLater()

    def tag_image(self, ctx: QWebEngineContextMenuRequest):
        """Start downloading the image under the cursor so it can be tagged.

        The download goes through the page's own profile and surfaces in
        on_download_requested(), which recognises it by URL.
        NOTE(review): assumes the download request reports the same URL that
        was requested (no redirect); otherwise the user gets the normal save
        dialog instead - confirm against real sites.
        """
        if self.current_page_id is None:
            QMessageBox.information(self, "Info", "No page loaded yet.")
            return
        media_url = ctx.mediaUrl()
        if not media_url.isValid():
            return
        self._pending_images[media_url.toString()] = self.current_page_id
        self.view.page().download(media_url)

    def _handle_image_download(self, request, expected_url):
        """Accept an image download into a scratch directory and store it when done."""
        page_id = self._pending_images.pop(expected_url, self.current_page_id)
        scratch_dir = tempfile.mkdtemp(prefix="webscraping-img-")
        request.setDownloadDirectory(scratch_dir)
        request.setDownloadFileName("image.bin")

        def on_finished_changed():
            if not request.isFinished():
                return
            try:
                self._store_image_tag(request, expected_url, page_id)
            finally:
                # The scratch directory is created here, so it is removed here.
                shutil.rmtree(scratch_dir, ignore_errors=True)

        request.isFinishedChanged.connect(on_finished_changed)
        request.accept()

    def _store_image_tag(self, request, url, page_id=None):
        """Read the finished image download, store its bytes and prompt for a tag.

        ``page_id`` defaults to the page that is current when the download
        finishes.
        """
        if request.state() != QWebEngineDownloadRequest.DownloadState.DownloadCompleted:
            QMessageBox.warning(self, "Download error", f"Failed to download {url}")
            return
        image_path = Path(request.downloadDirectory()) / request.downloadFileName()
        data = image_path.read_bytes()
        if page_id is None:
            page_id = self.current_page_id
        screenshot_id = save_screenshot(page_id, data)
        self.prompt_tag(screenshot_id)

    def prompt_tag(self, screenshot_id: int):
        """Ask for a tag and store it; blank input or Cancel stores nothing."""
        text, ok = QInputDialog.getText(self, "Tag image", "Enter tag:")
        tag = text.strip()
        if ok and tag:
            save_tag(screenshot_id, tag)
            # Report exactly what was stored (the stripped text).
            QMessageBox.information(self, "Tagged", f"Tag saved: {tag}")

    # ------------------------------------------------------------------
    def on_download_requested(self, request):
        """Route a download: internal image fetch, or a user-chosen save location."""
        source = request.url().toString()
        if source in self._pending_images:
            self._handle_image_download(request, source)
            return

        # For non-image files let the user choose a location.
        chosen = QFileDialog.getSaveFileName(self, "Save file", request.suggestedFileName())[0]
        if not chosen:
            # Dialog dismissed: reject explicitly.
            request.cancel()
            return
        target = Path(chosen)
        # Set directory and file name; setting only the name would ignore the
        # folder the user picked.
        request.setDownloadDirectory(str(target.parent))
        request.setDownloadFileName(target.name)
        request.accept()
|
||||
94
src/db.py
Normal file
94
src/db.py
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
# src/db.py
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Database file will be placed at the project root
DB_PATH = Path(__file__).parent.parent / "webscraping.db"

# DDL run by init_db(). Every statement uses IF NOT EXISTS, so re-running is safe.
_SCHEMA = (
    # Store loaded pages (URL + raw HTML)
    """CREATE TABLE IF NOT EXISTS pages (
        id INTEGER PRIMARY KEY,
        url TEXT NOT NULL,
        html TEXT,
        captured_at TEXT NOT NULL
    )""",
    # Screenshots (PNG BLOB) linked to a page
    """CREATE TABLE IF NOT EXISTS screenshots (
        id INTEGER PRIMARY KEY,
        page_id INTEGER,
        image BLOB,
        captured_at TEXT NOT NULL,
        FOREIGN KEY(page_id) REFERENCES pages(id)
    )""",
    # Tags for images/screenshots
    """CREATE TABLE IF NOT EXISTS tags (
        id INTEGER PRIMARY KEY,
        screenshot_id INTEGER,
        tag TEXT,
        created_at TEXT NOT NULL,
        FOREIGN KEY(screenshot_id) REFERENCES screenshots(id)
    )""",
    # Interaction events for replay (mouse, keyboard, etc.)
    """CREATE TABLE IF NOT EXISTS events (
        id INTEGER PRIMARY KEY,
        page_id INTEGER,
        event_type TEXT,
        data TEXT,
        occurred_at TEXT NOT NULL,
        FOREIGN KEY(page_id) REFERENCES pages(id)
    )""",
)


def _open() -> sqlite3.Connection:
    """Open a connection to DB_PATH with foreign-key enforcement switched on.

    SQLite leaves ``foreign_keys`` off for every new connection, so the
    FOREIGN KEY clauses in the schema are only enforced after this pragma.
    The caller owns the connection and must close it.
    """
    con = sqlite3.connect(DB_PATH)
    try:
        con.execute("PRAGMA foreign_keys = ON")
    except sqlite3.Error:
        con.close()
        raise
    return con


def _insert(sql: str, params: tuple) -> int:
    """Run one parameterised INSERT in its own transaction; return the new row id.

    Raises:
        sqlite3.IntegrityError: if a constraint (e.g. a foreign key) is violated.
    """
    con = _open()
    try:
        # The connection's context manager commits on success and rolls back
        # on error, but does NOT close; closing happens in the finally below.
        with con:
            return con.execute(sql, params).lastrowid
    finally:
        con.close()


def init_db():
    """Create tables if they do not exist."""
    con = _open()
    try:
        with con:
            for ddl in _SCHEMA:
                con.execute(ddl)
    finally:
        con.close()


def _timestamp() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS`` (no offset suffix).

    Uses an aware ``now()`` in place of the deprecated ``datetime.utcnow()``,
    then drops the tzinfo so the stored text format is unchanged.
    """
    from datetime import timezone

    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return now_utc.isoformat(timespec='seconds')


def save_page(url: str, html: str) -> int:
    """Insert a page record and return its row id."""
    return _insert(
        "INSERT INTO pages (url, html, captured_at) VALUES (?,?,?)",
        (url, html, _timestamp()),
    )


def save_screenshot(page_id: int, png_bytes: bytes) -> int:
    """Insert a screenshot linked to a page and return its row id.

    Raises:
        sqlite3.IntegrityError: if ``page_id`` is not NULL and no such page exists.
    """
    return _insert(
        "INSERT INTO screenshots (page_id, image, captured_at) VALUES (?,?,?)",
        (page_id, png_bytes, _timestamp()),
    )


def save_tag(screenshot_id: int, tag: str):
    """Add a textual tag to a screenshot.

    Raises:
        sqlite3.IntegrityError: if ``screenshot_id`` is not NULL and no such
            screenshot exists.
    """
    _insert(
        "INSERT INTO tags (screenshot_id, tag, created_at) VALUES (?,?,?)",
        (screenshot_id, tag, _timestamp()),
    )


def save_event(page_id: int, event_type: str, data: str):
    """Record an interaction event for later replay.

    Raises:
        sqlite3.IntegrityError: if ``page_id`` is not NULL and no such page exists.
    """
    _insert(
        "INSERT INTO events (page_id, event_type, data, occurred_at) VALUES (?,?,?,?)",
        (page_id, event_type, data, _timestamp()),
    )
|
||||
13
src/main.py
Normal file
13
src/main.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# src/main.py
|
||||
import sys
|
||||
from PySide6.QtWidgets import QApplication
|
||||
from .browser_window import BrowserWindow
|
||||
|
||||
def main():
    """Create the Qt application, show the browser window and run the event loop.

    The process exits with the status returned by the event loop.
    """
    application = QApplication(sys.argv)
    window = BrowserWindow()
    window.show()
    exit_code = application.exec()
    sys.exit(exit_code)


# Start the GUI only when this module is the program entry point.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue