first commit

This commit is contained in:
Dr Marc 2025-09-25 12:49:25 +02:00
commit 4767d7b95d
7 changed files with 268 additions and 0 deletions

13
.gitignore vendored Normal file
View file

@ -0,0 +1,13 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# Virtual environments
.venv
.python-version
uv.lock
webscraping.db

0
README.md Normal file
View file

9
pyproject.toml Normal file
View file

@ -0,0 +1,9 @@
[project]
name = "webscraping"
version = "0.1.0"
description = "Web Scraping App with Forensic Features"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"pyside6>=6.9.2",
]

1
src/__init__.py Normal file
View file

@ -0,0 +1 @@
# This file makes the src directory a package.

138
src/browser_window.py Normal file
View file

@ -0,0 +1,138 @@
# src/browser_window.py
import io
import tempfile
from pathlib import Path

from PySide6.QtCore import QBuffer, QByteArray, QIODevice, QUrl, Qt
from PySide6.QtGui import QAction
from PySide6.QtWebEngineCore import (
    QWebEngineContextMenuRequest,
    QWebEngineDownloadRequest,
)
from PySide6.QtWebEngineWidgets import QWebEngineView
from PySide6.QtWidgets import (
    QFileDialog,
    QInputDialog,
    QMainWindow,
    QMenu,
    QMessageBox,
    QToolBar,
)

from .db import init_db, save_event, save_page, save_screenshot, save_tag
class BrowserWindow(QMainWindow):
    """Main window: a web view that archives what it displays.

    Every successfully loaded page is stored (URL + HTML) through
    ``save_page``.  The toolbar offers navigation and a screenshot capture
    button; right-clicking an image offers to fetch that image and tag it.
    Images and screenshots are stored through ``save_screenshot`` and tagged
    through ``save_tag``.
    """

    def __init__(self):
        """Build the UI, wire the signals and initialise the database."""
        super().__init__()
        self.setWindowTitle("LawEnforcement Web Scraper")
        self.resize(1200, 800)

        # State is created before any signal is connected so that no slot
        # can ever observe a half-initialised window.
        self.current_page_id = None   # row id of the last stored page
        self.pending_url = ""         # URL reported by the last urlChanged
        # Image downloads started by tag_image(): URL -> page id at tag time.
        self._pending_image_tags = {}
        # Temporary directories holding those downloads: URL -> directory.
        self._image_tmp_dirs = {}

        init_db()

        # ---- Web view ------------------------------------------------
        self.view = QWebEngineView()
        self.setCentralWidget(self.view)

        # ---- Toolbar -------------------------------------------------
        toolbar = QToolBar()
        self.addToolBar(toolbar)
        # Actions without text and without icon render as blank buttons,
        # so every action gets a visible label.
        for label, slot in (
            ("Back", self.view.back),
            ("Forward", self.view.forward),
            ("Reload", self.view.reload),
            ("📸 Capture", self.capture_screenshot),
        ):
            action = QAction(label, self)
            action.triggered.connect(slot)
            toolbar.addAction(action)

        # ---- Signals -------------------------------------------------
        self.view.urlChanged.connect(self.on_url_changed)
        self.view.loadFinished.connect(self.on_load_finished)
        # Connected exactly once; image-tag downloads are dispatched from
        # inside on_download_requested instead of adding more connections.
        self.view.page().profile().downloadRequested.connect(
            self.on_download_requested
        )

        # Context-menu handling for image tagging
        self.view.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
        self.view.customContextMenuRequested.connect(self.show_context_menu)

    # ------------------------------------------------------------------
    def on_url_changed(self, url: QUrl):
        """Remember the URL; HTML is saved once the page finishes loading."""
        self.pending_url = url.toString()

    def on_load_finished(self, ok: bool):
        """Persist the page HTML after a successful load, else warn.

        Args:
            ok: ``True`` when the view reports that the load succeeded.
        """
        # Snapshot the URL now: toHtml() is asynchronous and pending_url may
        # already belong to a newer navigation when its callback runs.
        url = self.pending_url
        if not ok:
            QMessageBox.warning(self, "Load error", f"Failed to load {url}")
            return
        self.view.page().toHtml(lambda html: self._store_page(html, url))

    def _store_page(self, html: str, url=None):
        """Save a page record and keep its id for later screenshots.

        Args:
            html: Page source as delivered by ``toHtml``.
            url: URL the HTML belongs to; falls back to ``pending_url``
                when omitted.
        """
        page_url = self.pending_url if url is None else url
        self.current_page_id = save_page(page_url, html)

    # ------------------------------------------------------------------
    def capture_screenshot(self):
        """Grab the web view, store it as a PNG and ask for a tag."""
        page_id = self.current_page_id
        if page_id is None:
            QMessageBox.information(self, "Info", "No page loaded yet.")
            return

        # QWidget.grab() returns the QPixmap directly (it is synchronous).
        png_data = self._pixmap_to_png(self.view.grab())
        if not png_data:
            QMessageBox.warning(
                self, "Capture error", "Could not encode the screenshot."
            )
            return

        screenshot_id = save_screenshot(page_id, png_data)
        self.prompt_tag(screenshot_id)

    @staticmethod
    def _pixmap_to_png(pixmap) -> bytes:
        """Encode *pixmap* as PNG; return ``b""`` when encoding fails."""
        # QPixmap.save() writes to a QIODevice, so a QBuffer is used rather
        # than a Python file-like object.
        payload = QByteArray()
        device = QBuffer(payload)
        if not device.open(QIODevice.OpenModeFlag.WriteOnly):
            return b""
        try:
            encoded = pixmap.save(device, "PNG")
        finally:
            device.close()
        return bytes(payload.data()) if encoded else b""

    # ------------------------------------------------------------------
    def show_context_menu(self, pos):
        """Show the "Add tag to image" menu when an image was right-clicked.

        Args:
            pos: Click position in view coordinates, as emitted by
                ``customContextMenuRequested``.
        """
        ctx = self.view.lastContextMenuRequest()
        if ctx is None:
            return
        image_type = QWebEngineContextMenuRequest.MediaType.MediaTypeImage
        if ctx.mediaType() != image_type:
            return

        menu = QMenu(self)
        tag_act = menu.addAction("Add tag to image")
        tag_act.triggered.connect(lambda _checked=False: self.tag_image(ctx))
        menu.exec(self.view.mapToGlobal(pos))
        # The menu is parented to the window; without this every right-click
        # would leave another QMenu alive until the window is destroyed.
        menu.deleteLater()

    def tag_image(self, ctx: QWebEngineContextMenuRequest):
        """Start fetching the image behind *ctx* so that it can be tagged.

        The download is requested from the page itself.  It arrives in
        ``on_download_requested``, which recognises it by its URL and routes
        it to ``_handle_image_download``.
        """
        media_url = ctx.mediaUrl()
        if not media_url.isValid():
            QMessageBox.warning(self, "Tag image", "The image has no valid URL.")
            return
        # Remember which page the image was seen on; the user may navigate
        # away before the download completes.
        self._pending_image_tags[media_url.toString()] = self.current_page_id
        self.view.page().download(media_url)

    def _handle_image_download(self, request, expected_url):
        """Accept an image-tag download into a private temporary directory.

        Args:
            request: The ``QWebEngineDownloadRequest`` for the image.
            expected_url: Key under which ``tag_image`` registered it.
        """
        tmp_dir = tempfile.TemporaryDirectory(prefix="webscraping-img-")
        self._image_tmp_dirs[expected_url] = tmp_dir
        request.setDownloadDirectory(tmp_dir.name)
        request.setDownloadFileName(request.suggestedFileName() or "image")
        # Connect before accepting so that completion cannot be missed.
        request.isFinishedChanged.connect(
            lambda: self._store_image_tag(request, expected_url)
        )
        request.accept()

    def _store_image_tag(self, request, url):
        """Store a finished image download and prompt for its tag.

        Args:
            request: The finished ``QWebEngineDownloadRequest``.
            url: Key under which ``tag_image`` registered the download.
        """
        if not request.isFinished():
            return

        page_id = self._pending_image_tags.pop(url, self.current_page_id)
        tmp_dir = self._image_tmp_dirs.pop(url, None)
        completed = QWebEngineDownloadRequest.DownloadState.DownloadCompleted
        data = b""
        try:
            if request.state() == completed:
                target = Path(request.downloadDirectory()) / request.downloadFileName()
                try:
                    data = target.read_bytes()
                except OSError:
                    data = b""
        finally:
            # Only the directory created by _handle_image_download is ever
            # removed; user-chosen download locations are never touched.
            if tmp_dir is not None:
                tmp_dir.cleanup()

        if not data:
            QMessageBox.warning(self, "Download error", f"Could not fetch image {url}")
            return

        screenshot_id = save_screenshot(page_id, data)
        self.prompt_tag(screenshot_id)

    def prompt_tag(self, screenshot_id: int):
        """Ask for a tag and attach it to *screenshot_id* when one is given."""
        text, accepted = QInputDialog.getText(self, "Tag image", "Enter tag:")
        tag = text.strip()
        if accepted and tag:
            save_tag(screenshot_id, tag)
            QMessageBox.information(self, "Tagged", f"Tag saved: {tag}")

    # ------------------------------------------------------------------
    def on_download_requested(self, request):
        """Route a download: image-tag fetches internally, others to disk.

        Args:
            request: The ``QWebEngineDownloadRequest`` emitted by the profile.
        """
        # NOTE(review): matching is by URL string; presumably a redirected
        # image reports a different URL and then falls through to the
        # regular save dialog - confirm against the Qt documentation.
        url = request.url().toString()
        if url in self._pending_image_tags:
            self._handle_image_download(request, url)
            return

        # For non-image files let the user choose the location.
        suggested = Path(request.downloadDirectory()) / request.suggestedFileName()
        chosen, _ = QFileDialog.getSaveFileName(self, "Save file", str(suggested))
        if not chosen:
            request.cancel()
            return

        target = Path(chosen)
        # Both parts are needed; setting only the file name would ignore
        # the directory picked in the dialog.
        request.setDownloadDirectory(str(target.parent))
        request.setDownloadFileName(target.name)
        request.accept()

94
src/db.py Normal file
View file

@ -0,0 +1,94 @@
# src/db.py
import sqlite3
from pathlib import Path
from datetime import datetime
# The SQLite database lives two directory levels above this module file,
# i.e. in the directory that contains the ``src`` package.
_PROJECT_ROOT = Path(__file__).parent.parent
DB_PATH = _PROJECT_ROOT / "webscraping.db"
def init_db():
    """Create the ``pages``, ``screenshots``, ``tags`` and ``events`` tables.

    Every statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this on
    an already initialised database changes nothing.
    """
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" only manages the transaction (commit on success,
        # rollback on error); it does not close the connection, hence the
        # surrounding try/finally.
        with con:
            # Store loaded pages (URL + raw HTML)
            con.execute("""CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY,
                url TEXT NOT NULL,
                html TEXT,
                captured_at TEXT NOT NULL
            )""")
            # Screenshots (PNG BLOB) linked to a page
            con.execute("""CREATE TABLE IF NOT EXISTS screenshots (
                id INTEGER PRIMARY KEY,
                page_id INTEGER,
                image BLOB,
                captured_at TEXT NOT NULL,
                FOREIGN KEY(page_id) REFERENCES pages(id)
            )""")
            # Tags for images/screenshots
            con.execute("""CREATE TABLE IF NOT EXISTS tags (
                id INTEGER PRIMARY KEY,
                screenshot_id INTEGER,
                tag TEXT,
                created_at TEXT NOT NULL,
                FOREIGN KEY(screenshot_id) REFERENCES screenshots(id)
            )""")
            # Interaction events for replay (mouse, keyboard, etc.)
            con.execute("""CREATE TABLE IF NOT EXISTS events (
                id INTEGER PRIMARY KEY,
                page_id INTEGER,
                event_type TEXT,
                data TEXT,
                occurred_at TEXT NOT NULL,
                FOREIGN KEY(page_id) REFERENCES pages(id)
            )""")
    finally:
        con.close()
def _timestamp() -> str:
return datetime.utcnow().isoformat(timespec='seconds')
def save_page(url: str, html: str) -> int:
    """Insert a page record and return its row id.

    Args:
        url: Address the page was loaded from.
        html: Page source to store alongside the URL.

    Returns:
        The ``id`` of the newly inserted ``pages`` row.
    """
    ts = _timestamp()
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" commits or rolls back but leaves the connection open,
        # so it is closed explicitly in the finally clause.
        with con:
            cur = con.execute(
                "INSERT INTO pages (url, html, captured_at) VALUES (?,?,?)",
                (url, html, ts),
            )
        return cur.lastrowid
    finally:
        con.close()
def save_screenshot(page_id: int, png_bytes: bytes) -> int:
    """Insert a screenshot linked to a page and return its row id.

    Args:
        page_id: ``id`` of the ``pages`` row the image belongs to.
        png_bytes: Raw image data stored in the ``image`` BLOB column.

    Returns:
        The ``id`` of the newly inserted ``screenshots`` row.
    """
    ts = _timestamp()
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" commits or rolls back but leaves the connection open,
        # so it is closed explicitly in the finally clause.
        with con:
            cur = con.execute(
                "INSERT INTO screenshots (page_id, image, captured_at) VALUES (?,?,?)",
                (page_id, png_bytes, ts),
            )
        return cur.lastrowid
    finally:
        con.close()
def save_tag(screenshot_id: int, tag: str):
    """Add a textual tag to a screenshot.

    Args:
        screenshot_id: ``id`` of the ``screenshots`` row being tagged.
        tag: Tag text to store.
    """
    ts = _timestamp()
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" commits or rolls back but leaves the connection open,
        # so it is closed explicitly in the finally clause.
        with con:
            con.execute(
                "INSERT INTO tags (screenshot_id, tag, created_at) VALUES (?,?,?)",
                (screenshot_id, tag, ts),
            )
    finally:
        con.close()
def save_event(page_id: int, event_type: str, data: str):
    """Record an interaction event for later replay.

    Args:
        page_id: ``id`` of the ``pages`` row the event occurred on.
        event_type: Label stored in the ``event_type`` column.
        data: Event payload stored as text in the ``data`` column.
    """
    ts = _timestamp()
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" commits or rolls back but leaves the connection open,
        # so it is closed explicitly in the finally clause.
        with con:
            con.execute(
                "INSERT INTO events (page_id, event_type, data, occurred_at) VALUES (?,?,?,?)",
                (page_id, event_type, data, ts),
            )
    finally:
        con.close()

13
src/main.py Normal file
View file

@ -0,0 +1,13 @@
# src/main.py
import sys
from PySide6.QtWidgets import QApplication
from .browser_window import BrowserWindow
def main():
    """Create the Qt application, show the browser window and run it.

    The process exits with the status returned by the Qt event loop.
    """
    application = QApplication(sys.argv)
    window = BrowserWindow()
    window.show()
    exit_status = application.exec()
    sys.exit(exit_status)


if __name__ == "__main__":
    main()