From 4767d7b95d0f28edcffd438526da16935eeff4e3 Mon Sep 17 00:00:00 2001
From: Dr Marc
Date: Thu, 25 Sep 2025 12:49:25 +0200
Subject: [PATCH] first commit

---
 .gitignore            |  13 ++++
 README.md             |   0
 pyproject.toml        |   9 +++
 src/__init__.py       |   1 +
 src/browser_window.py | 173 ++++++++++++++++++++++++++++++++++++++++++
 src/db.py             | 101 ++++++++++++++++++++++++
 src/main.py           |  17 +++++
 7 files changed, 314 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 pyproject.toml
 create mode 100644 src/__init__.py
 create mode 100644 src/browser_window.py
 create mode 100644 src/db.py
 create mode 100644 src/main.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..11f28fd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,13 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+.python-version
+uv.lock
+webscraping.db
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..eedac9c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "webscraping"
+version = "0.1.0"
+description = "Web Scraping App with Forensic Features"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "pyside6>=6.9.2",
+]
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..73b6c9b
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+# This file makes the src directory a package.
diff --git a/src/browser_window.py b/src/browser_window.py
new file mode 100644
--- /dev/null
+++ b/src/browser_window.py
@@ -0,0 +1,173 @@
+# src/browser_window.py
+"""Main window of the forensic web-scraping browser."""
+from PySide6.QtWidgets import (
+    QMainWindow, QToolBar, QMenu, QMessageBox, QFileDialog, QInputDialog
+)
+from PySide6.QtGui import QAction
+
+from PySide6.QtNetwork import QNetworkAccessManager, QNetworkReply, QNetworkRequest
+from PySide6.QtWebEngineWidgets import QWebEngineView
+from PySide6.QtWebEngineCore import QWebEngineContextMenuRequest
+from PySide6.QtCore import QBuffer, QIODevice, QUrl, Qt
+from pathlib import Path
+import io
+
+from .db import init_db, save_page, save_screenshot, save_tag, save_event
+
+
+class BrowserWindow(QMainWindow):
+    """Browser window that archives loaded pages, screenshots and image tags."""
+
+    def __init__(self):
+        """Build the web view, toolbar and signal wiring, then open the database."""
+        super().__init__()
+        self.setWindowTitle("Law‑Enforcement Web Scraper")
+        self.resize(1200, 800)
+
+        # ---- Web view ------------------------------------------------
+        self.view = QWebEngineView()
+        self.setCentralWidget(self.view)
+
+        # ---- Toolbar -------------------------------------------------
+        toolbar = QToolBar()
+        self.addToolBar(toolbar)
+
+        back_act = QAction("←", self)
+        back_act.triggered.connect(self.view.back)
+        toolbar.addAction(back_act)
+
+        forward_act = QAction("→", self)
+        forward_act.triggered.connect(self.view.forward)
+        toolbar.addAction(forward_act)
+
+        reload_act = QAction("⟳", self)
+        reload_act.triggered.connect(self.view.reload)
+        toolbar.addAction(reload_act)
+
+        capture_act = QAction("📸 Capture", self)
+        capture_act.triggered.connect(self.capture_screenshot)
+        toolbar.addAction(capture_act)
+
+        # ---- Signals -------------------------------------------------
+        self.view.urlChanged.connect(self.on_url_changed)
+        self.view.loadFinished.connect(self.on_load_finished)
+        self.view.page().profile().downloadRequested.connect(self.on_download_requested)
+
+        # Context-menu handling for image tagging
+        self.view.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
+        self.view.customContextMenuRequested.connect(self.show_context_menu)
+
+        # Fetches the bytes of images the user picks for tagging
+        self.network = QNetworkAccessManager(self)
+
+        # Initialise DB and state variables
+        init_db()
+        self.current_page_id = None
+        self.pending_url = ""
+
+    # ------------------------------------------------------------------
+    def on_url_changed(self, url: QUrl):
+        """Remember the URL; HTML will be saved once the page finishes loading."""
+        self.pending_url = url.toString()
+
+    def on_load_finished(self, ok: bool):
+        """Persist the HTML of a loaded page, or warn if loading failed."""
+        if not ok:
+            QMessageBox.warning(self, "Load error", f"Failed to load {self.pending_url}")
+            return
+        # toHtml() is asynchronous; the source arrives in the callback
+        self.view.page().toHtml(lambda html: self._store_page(html))
+
+    def _store_page(self, html: str):
+        """Save the page record and keep its id for later screenshots/events."""
+        self.current_page_id = save_page(self.pending_url, html)
+
+    # ------------------------------------------------------------------
+    def capture_screenshot(self):
+        """Store a PNG of the web view for the current page and ask for a tag."""
+        if self.current_page_id is None:
+            QMessageBox.information(self, "Info", "No page loaded yet.")
+            return
+
+        # QWidget.grab() is synchronous and returns the QPixmap directly
+        pixmap = self.view.grab()
+        # QPixmap.save() needs a file name or QIODevice, not a Python BytesIO
+        buffer = QBuffer()
+        buffer.open(QIODevice.OpenModeFlag.WriteOnly)
+        saved = pixmap.save(buffer, "PNG")
+        png_data = buffer.data().data()
+        buffer.close()
+        if not saved:
+            QMessageBox.warning(self, "Capture error", "Could not encode screenshot.")
+            return
+
+        screenshot_id = save_screenshot(self.current_page_id, png_data)
+        self.prompt_tag(screenshot_id)
+
+    # ------------------------------------------------------------------
+    def show_context_menu(self, pos):
+        """Show an "Add tag to image" menu when an image was right-clicked."""
+        # Qt 6 exposes the request on the view; page().contextMenuData() was Qt 5
+        ctx = self.view.lastContextMenuRequest()
+        if ctx is None:
+            return
+        if ctx.mediaType() == QWebEngineContextMenuRequest.MediaType.MediaTypeImage:
+            menu = QMenu(self)
+            tag_act = menu.addAction("Add tag to image")
+            tag_act.triggered.connect(lambda: self.tag_image(ctx))
+            menu.exec(self.view.mapToGlobal(pos))
+
+    def tag_image(self, ctx: QWebEngineContextMenuRequest):
+        """Fetch the image referenced by ``ctx`` so it can be stored and tagged."""
+        if self.current_page_id is None:
+            QMessageBox.information(self, "Info", "No page loaded yet.")
+            return
+        img_url = ctx.mediaUrl()
+        # Remember the page now; the user may navigate before the fetch ends
+        page_id = self.current_page_id
+        reply = self.network.get(QNetworkRequest(img_url))
+        reply.finished.connect(
+            lambda: self._handle_image_download(reply, img_url.toString(), page_id)
+        )
+
+    def _handle_image_download(self, request, expected_url, page_id=None):
+        """Validate a finished image fetch (a QNetworkReply) and store its data."""
+        request.deleteLater()
+        if request.error() != QNetworkReply.NetworkError.NoError:
+            QMessageBox.warning(
+                self, "Download error",
+                f"Failed to fetch {expected_url}: {request.errorString()}"
+            )
+            return
+        self._store_image_tag(request, expected_url, page_id)
+
+    def _store_image_tag(self, request, url, page_id=None):
+        """Save the fetched image bytes as a screenshot row and ask for a tag."""
+        data = request.readAll().data()
+        if page_id is None:
+            page_id = self.current_page_id
+        screenshot_id = save_screenshot(page_id, data)
+        self.prompt_tag(screenshot_id)
+
+    def prompt_tag(self, screenshot_id: int):
+        """Ask for a tag and save it unless the dialog is cancelled or left blank."""
+        tag, ok = QInputDialog.getText(self, "Tag image", "Enter tag:")
+        tag = tag.strip()
+        if ok and tag:
+            save_tag(screenshot_id, tag)
+            QMessageBox.information(self, "Tagged", f"Tag saved: {tag}")
+
+    # ------------------------------------------------------------------
+    def on_download_requested(self, request):
+        """Let the user choose where a download is saved; cancel it otherwise."""
+        target, _ = QFileDialog.getSaveFileName(
+            self, "Save file", request.suggestedFileName()
+        )
+        if not target:
+            request.cancel()
+            return
+        target_path = Path(target)
+        # Set the folder too; the file name alone would drop the chosen location
+        request.setDownloadDirectory(str(target_path.parent))
+        request.setDownloadFileName(target_path.name)
+        request.accept()
diff --git a/src/db.py b/src/db.py
new file mode 100644
--- /dev/null
+++ b/src/db.py
@@ -0,0 +1,101 @@
+# src/db.py
+"""SQLite persistence for captured pages, screenshots, tags and events."""
+import sqlite3
+from contextlib import closing
+from pathlib import Path
+from datetime import datetime, timezone
+
+# Database file will be placed at the project root
+DB_PATH = Path(__file__).parent.parent / "webscraping.db"
+
+
+def _connect() -> sqlite3.Connection:
+    """Open a connection to ``DB_PATH`` with foreign keys enforced."""
+    con = sqlite3.connect(DB_PATH)
+    # SQLite only honours FOREIGN KEY clauses when enabled per connection
+    con.execute("PRAGMA foreign_keys = ON")
+    return con
+
+
+def init_db():
+    """Create tables if they do not exist."""
+    # closing() releases the handle; the inner "con" block only commits/rolls back
+    with closing(_connect()) as con, con:
+        # Store loaded pages (URL + raw HTML)
+        con.execute("""CREATE TABLE IF NOT EXISTS pages (
+            id INTEGER PRIMARY KEY,
+            url TEXT NOT NULL,
+            html TEXT,
+            captured_at TEXT NOT NULL
+        )""")
+        # Screenshots (PNG BLOB) linked to a page
+        con.execute("""CREATE TABLE IF NOT EXISTS screenshots (
+            id INTEGER PRIMARY KEY,
+            page_id INTEGER,
+            image BLOB,
+            captured_at TEXT NOT NULL,
+            FOREIGN KEY(page_id) REFERENCES pages(id)
+        )""")
+        # Tags for images/screenshots
+        con.execute("""CREATE TABLE IF NOT EXISTS tags (
+            id INTEGER PRIMARY KEY,
+            screenshot_id INTEGER,
+            tag TEXT,
+            created_at TEXT NOT NULL,
+            FOREIGN KEY(screenshot_id) REFERENCES screenshots(id)
+        )""")
+        # Interaction events for replay (mouse, keyboard, etc.)
+        con.execute("""CREATE TABLE IF NOT EXISTS events (
+            id INTEGER PRIMARY KEY,
+            page_id INTEGER,
+            event_type TEXT,
+            data TEXT,
+            occurred_at TEXT NOT NULL,
+            FOREIGN KEY(page_id) REFERENCES pages(id)
+        )""")
+
+
+def _timestamp() -> str:
+    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS`` (no offset suffix)."""
+    # datetime.utcnow() is deprecated since Python 3.12; dropping tzinfo from an
+    # aware UTC value keeps the stored format unchanged
+    now = datetime.now(timezone.utc).replace(tzinfo=None)
+    return now.isoformat(timespec='seconds')
+
+
+def save_page(url: str, html: str) -> int:
+    """Insert a page record and return its row id."""
+    with closing(_connect()) as con, con:
+        cur = con.execute(
+            "INSERT INTO pages (url, html, captured_at) VALUES (?,?,?)",
+            (url, html, _timestamp())
+        )
+        return cur.lastrowid
+
+
+def save_screenshot(page_id: int, png_bytes: bytes) -> int:
+    """Insert a screenshot linked to a page and return its row id."""
+    with closing(_connect()) as con, con:
+        cur = con.execute(
+            "INSERT INTO screenshots (page_id, image, captured_at) VALUES (?,?,?)",
+            (page_id, png_bytes, _timestamp())
+        )
+        return cur.lastrowid
+
+
+def save_tag(screenshot_id: int, tag: str):
+    """Add a textual tag to a screenshot."""
+    with closing(_connect()) as con, con:
+        con.execute(
+            "INSERT INTO tags (screenshot_id, tag, created_at) VALUES (?,?,?)",
+            (screenshot_id, tag, _timestamp())
+        )
+
+
+def save_event(page_id: int, event_type: str, data: str):
+    """Record an interaction event for later replay."""
+    with closing(_connect()) as con, con:
+        con.execute(
+            "INSERT INTO events (page_id, event_type, data, occurred_at) VALUES (?,?,?,?)",
+            (page_id, event_type, data, _timestamp())
+        )
diff --git a/src/main.py b/src/main.py
new file mode 100644
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,17 @@
+# src/main.py
+"""Application entry point; run it as a module: ``python -m src.main``."""
+import sys
+from PySide6.QtWidgets import QApplication
+from .browser_window import BrowserWindow
+
+
+def main():
+    """Create the Qt application, show the browser window and run the event loop."""
+    app = QApplication(sys.argv)
+    win = BrowserWindow()
+    win.show()
+    sys.exit(app.exec())
+
+
+if __name__ == "__main__":
+    main()