first commit
This commit is contained in:
commit
4767d7b95d
7 changed files with 268 additions and 0 deletions
13
.gitignore
vendored
Normal file
13
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
# Python-generated files
|
||||||
|
__pycache__/
|
||||||
|
*.py[oc]
|
||||||
|
build/
|
||||||
|
dist/
|
||||||
|
wheels/
|
||||||
|
*.egg-info
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
.venv
|
||||||
|
.python-version
|
||||||
|
uv.lock
|
||||||
|
webscraping.db
|
||||||
0
README.md
Normal file
0
README.md
Normal file
9
pyproject.toml
Normal file
9
pyproject.toml
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
[project]
|
||||||
|
name = "webscraping"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Web Scraping App with Forensic Features"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"pyside6>=6.9.2",
|
||||||
|
]
|
||||||
1
src/__init__.py
Normal file
1
src/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# This file makes the src directory a package.
|
||||||
138
src/browser_window.py
Normal file
138
src/browser_window.py
Normal file
|
|
@ -0,0 +1,138 @@
|
||||||
|
# src/browser_window.py
|
||||||
|
from PySide6.QtWidgets import (
|
||||||
|
QMainWindow, QToolBar, QMenu, QMessageBox, QFileDialog, QInputDialog
|
||||||
|
)
|
||||||
|
from PySide6.QtGui import QAction
|
||||||
|
|
||||||
|
from PySide6.QtWebEngineWidgets import QWebEngineView
|
||||||
|
from PySide6.QtWebEngineCore import QWebEngineContextMenuRequest
|
||||||
|
from PySide6.QtCore import QUrl, Qt
|
||||||
|
from pathlib import Path
|
||||||
|
import io
|
||||||
|
|
||||||
|
from .db import init_db, save_page, save_screenshot, save_tag, save_event
|
||||||
|
|
||||||
|
class BrowserWindow(QMainWindow):
    """Browser window that archives visited pages, screenshots and image tags.

    A ``QWebEngineView`` is the central widget.  After each successful load
    the page URL and HTML are written with ``save_page``.  The toolbar's
    capture action and the image context menu store image bytes with
    ``save_screenshot`` and then ask the user for a tag.
    """

    def __init__(self):
        """Build the UI, connect the signals and create the DB schema if needed."""
        super().__init__()
        self.setWindowTitle("Law‑Enforcement Web Scraper")
        self.resize(1200, 800)

        # ---- State ---------------------------------------------------
        self.current_page_id = None  # row id returned by the last save_page()
        self.pending_url = ""        # last URL reported by urlChanged
        self._image_fetcher = None   # QNetworkAccessManager, created on first use
        init_db()

        # ---- Web view ------------------------------------------------
        self.view = QWebEngineView()
        self.setCentralWidget(self.view)

        # ---- Toolbar -------------------------------------------------
        toolbar = QToolBar()
        self.addToolBar(toolbar)
        for label, slot in (
            ("←", self.view.back),
            ("→", self.view.forward),
            ("⟳", self.view.reload),
            ("📸 Capture", self.capture_screenshot),
        ):
            action = QAction(label, self)
            action.triggered.connect(slot)
            toolbar.addAction(action)

        # ---- Signals -------------------------------------------------
        self.view.urlChanged.connect(self.on_url_changed)
        self.view.loadFinished.connect(self.on_load_finished)
        self.view.page().profile().downloadRequested.connect(self.on_download_requested)

        # Context-menu handling for image tagging
        self.view.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
        self.view.customContextMenuRequested.connect(self.show_context_menu)

    # ------------------------------------------------------------------
    def on_url_changed(self, url: QUrl):
        """Remember the URL; HTML will be saved once the page finishes loading."""
        self.pending_url = url.toString()

    def on_load_finished(self, ok: bool):
        """Store the page HTML after a successful load, or warn on failure.

        Args:
            ok: Value of the ``loadFinished`` signal; False means the load failed.
        """
        # Snapshot the URL now: toHtml() answers asynchronously and
        # pending_url may already point at the next navigation by then.
        url = self.pending_url
        if not ok:
            QMessageBox.warning(self, "Load error", f"Failed to load {url}")
            return
        # Grab the HTML source and persist it
        self.view.page().toHtml(lambda html: self._store_page(html, url))

    def _store_page(self, html: str, url=None):
        """Save a page record and keep its id for later screenshots/events.

        Args:
            html: HTML source delivered by ``toHtml``.
            url: URL the HTML belongs to; falls back to ``self.pending_url``
                when omitted.
        """
        page_url = self.pending_url if url is None else url
        self.current_page_id = save_page(page_url, html)

    # ------------------------------------------------------------------
    def capture_screenshot(self):
        """Grab the web view as a PNG, store it and prompt for a tag."""
        if self.current_page_id is None:
            QMessageBox.information(self, "Info", "No page loaded yet.")
            return

        # Local import keeps this fix self-contained within the class.
        from PySide6.QtCore import QBuffer, QByteArray, QIODevice

        # QWidget.grab() returns the QPixmap directly, and QPixmap.save()
        # writes to a QIODevice, so encode into a QBuffer over a QByteArray.
        png = QByteArray()
        device = QBuffer(png)
        device.open(QIODevice.OpenModeFlag.WriteOnly)
        encoded = self.view.grab().save(device, "PNG")
        device.close()
        if not encoded:
            QMessageBox.warning(self, "Capture error", "Could not encode the screenshot.")
            return

        screenshot_id = save_screenshot(self.current_page_id, bytes(png.data()))
        self.prompt_tag(screenshot_id)

    # ------------------------------------------------------------------
    def show_context_menu(self, pos):
        """Show an "Add tag to image" menu when the right-clicked element is an image.

        Args:
            pos: Position supplied by ``customContextMenuRequested``; it is
                mapped to global coordinates through the view.
        """
        # Qt 6 exposes the context-menu data on the view, not on the page.
        ctx = self.view.lastContextMenuRequest()
        if ctx is None:
            return
        if ctx.mediaType() != QWebEngineContextMenuRequest.MediaType.MediaTypeImage:
            return

        menu = QMenu(self)
        tag_act = QAction("Add tag to image", self)
        tag_act.triggered.connect(lambda _checked=False: self.tag_image(ctx))
        menu.addAction(tag_act)
        menu.exec(self.view.mapToGlobal(pos))

    def tag_image(self, ctx: QWebEngineContextMenuRequest):
        """Fetch the image behind a context-menu request and store it for tagging.

        Args:
            ctx: Context-menu request whose ``mediaUrl()`` points at the image.
        """
        if self.current_page_id is None:
            QMessageBox.information(self, "Info", "No page loaded yet.")
            return
        media_url = ctx.mediaUrl()
        if not media_url.isValid():
            return

        from PySide6.QtNetwork import QNetworkAccessManager, QNetworkRequest

        # One manager for the window's lifetime; connecting a new handler on
        # every call (as the download-signal approach did) would pile up.
        if self._image_fetcher is None:
            self._image_fetcher = QNetworkAccessManager(self)

        # NOTE(review): this fetch does not share the web profile's cookies,
        # so images that need the page's login session may fail - confirm
        # whether that matters for the intended use.
        page_id = self.current_page_id
        url_text = media_url.toString()
        reply = self._image_fetcher.get(QNetworkRequest(media_url))
        reply.finished.connect(
            lambda: self._store_image_tag(reply, url_text, page_id)
        )

    def _store_image_tag(self, reply, url, page_id=None):
        """Persist the bytes of a finished image fetch and prompt for a tag.

        Args:
            reply: Finished ``QNetworkReply`` carrying the image bytes.
            url: Image URL, used only in error messages.
            page_id: Page the image belongs to; falls back to
                ``self.current_page_id`` when omitted.
        """
        from PySide6.QtNetwork import QNetworkReply

        try:
            if reply.error() != QNetworkReply.NetworkError.NoError:
                QMessageBox.warning(
                    self, "Image error", f"Could not fetch {url}: {reply.errorString()}"
                )
                return
            data = bytes(reply.readAll().data())
        finally:
            # The reply belongs to the manager; release it once it is consumed.
            reply.deleteLater()

        if not data:
            QMessageBox.warning(self, "Image error", f"No data received for {url}")
            return

        owner_id = self.current_page_id if page_id is None else page_id
        screenshot_id = save_screenshot(owner_id, data)
        self.prompt_tag(screenshot_id)

    def prompt_tag(self, screenshot_id: int):
        """Ask for a tag and attach it to the given screenshot row.

        Nothing is stored when the dialog is cancelled or the text is blank.
        """
        text, ok = QInputDialog.getText(self, "Tag image", "Enter tag:")
        tag = text.strip()
        if ok and tag:
            save_tag(screenshot_id, tag)
            # Report exactly what was stored (the stripped tag).
            QMessageBox.information(self, "Tagged", f"Tag saved: {tag}")

    # ------------------------------------------------------------------
    def on_download_requested(self, request):
        """Let the user choose where a requested download is saved.

        Args:
            request: Download request emitted by the profile's
                ``downloadRequested`` signal.
        """
        chosen = QFileDialog.getSaveFileName(self, "Save file", request.suggestedFileName())[0]
        if not chosen:
            # Dialog cancelled: resolve the request explicitly.
            request.cancel()
            return
        target = Path(chosen)
        # Apply both parts of the chosen path; setting only the file name
        # would drop the directory the user picked.
        request.setDownloadDirectory(str(target.parent))
        request.setDownloadFileName(target.name)
        request.accept()
|
||||||
94
src/db.py
Normal file
94
src/db.py
Normal file
|
|
@ -0,0 +1,94 @@
|
||||||
|
# src/db.py
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# The SQLite file sits one directory above this module's directory.
_PROJECT_ROOT = Path(__file__).parent.parent
DB_PATH = _PROJECT_ROOT / "webscraping.db"
|
||||||
|
|
||||||
|
def init_db():
    """Create the pages, screenshots, tags and events tables if they do not exist.

    Safe to call repeatedly: every statement uses ``CREATE TABLE IF NOT EXISTS``.
    """
    statements = (
        # Store loaded pages (URL + raw HTML)
        """CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY,
            url TEXT NOT NULL,
            html TEXT,
            captured_at TEXT NOT NULL
        )""",
        # Screenshots (PNG BLOB) linked to a page
        """CREATE TABLE IF NOT EXISTS screenshots (
            id INTEGER PRIMARY KEY,
            page_id INTEGER,
            image BLOB,
            captured_at TEXT NOT NULL,
            FOREIGN KEY(page_id) REFERENCES pages(id)
        )""",
        # Tags for images/screenshots
        """CREATE TABLE IF NOT EXISTS tags (
            id INTEGER PRIMARY KEY,
            screenshot_id INTEGER,
            tag TEXT,
            created_at TEXT NOT NULL,
            FOREIGN KEY(screenshot_id) REFERENCES screenshots(id)
        )""",
        # Interaction events for replay (mouse, keyboard, etc.)
        """CREATE TABLE IF NOT EXISTS events (
            id INTEGER PRIMARY KEY,
            page_id INTEGER,
            event_type TEXT,
            data TEXT,
            occurred_at TEXT NOT NULL,
            FOREIGN KEY(page_id) REFERENCES pages(id)
        )""",
    )
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" only commits or rolls back; it does not close the
        # connection, hence the explicit close() in the finally clause.
        with con:
            for ddl in statements:
                con.execute(ddl)
    finally:
        con.close()
|
||||||
|
|
||||||
|
def _timestamp() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS`` (no offset suffix)."""
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12.  Take an aware UTC
    # time and drop tzinfo so the text format matches rows already stored.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return now_utc.isoformat(timespec='seconds')
|
||||||
|
|
||||||
|
def save_page(url: str, html: str) -> int:
    """Insert a page record and return its row id.

    Args:
        url: Address of the page.
        html: HTML source of the page.

    Returns:
        The ``id`` of the newly inserted ``pages`` row.
    """
    ts = _timestamp()
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" commits on success and rolls back on error, but does
        # not close the connection; the finally clause does that.
        with con:
            cur = con.execute(
                "INSERT INTO pages (url, html, captured_at) VALUES (?,?,?)",
                (url, html, ts),
            )
            return cur.lastrowid
    finally:
        con.close()
|
||||||
|
|
||||||
|
def save_screenshot(page_id: int, png_bytes: bytes) -> int:
    """Insert a screenshot linked to a page and return its row id.

    Args:
        page_id: ``id`` of the ``pages`` row the image belongs to.
        png_bytes: Image bytes stored in the ``image`` BLOB column.

    Returns:
        The ``id`` of the newly inserted ``screenshots`` row.
    """
    ts = _timestamp()
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" commits on success and rolls back on error, but does
        # not close the connection; the finally clause does that.
        with con:
            cur = con.execute(
                "INSERT INTO screenshots (page_id, image, captured_at) VALUES (?,?,?)",
                (page_id, png_bytes, ts),
            )
            return cur.lastrowid
    finally:
        con.close()
|
||||||
|
|
||||||
|
def save_tag(screenshot_id: int, tag: str):
    """Add a textual tag to a screenshot.

    Args:
        screenshot_id: ``id`` of the ``screenshots`` row being tagged.
        tag: Tag text to store.
    """
    ts = _timestamp()
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" commits on success and rolls back on error, but does
        # not close the connection; the finally clause does that.
        with con:
            con.execute(
                "INSERT INTO tags (screenshot_id, tag, created_at) VALUES (?,?,?)",
                (screenshot_id, tag, ts),
            )
    finally:
        con.close()
|
||||||
|
|
||||||
|
def save_event(page_id: int, event_type: str, data: str):
    """Record an interaction event for later replay.

    Args:
        page_id: ``id`` of the ``pages`` row the event occurred on.
        event_type: Value for the ``event_type`` column.
        data: Event payload stored as text.
    """
    ts = _timestamp()
    con = sqlite3.connect(DB_PATH)
    try:
        # "with con" commits on success and rolls back on error, but does
        # not close the connection; the finally clause does that.
        with con:
            con.execute(
                "INSERT INTO events (page_id, event_type, data, occurred_at) VALUES (?,?,?,?)",
                (page_id, event_type, data, ts),
            )
    finally:
        con.close()
|
||||||
13
src/main.py
Normal file
13
src/main.py
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
# src/main.py
|
||||||
|
import sys
|
||||||
|
from PySide6.QtWidgets import QApplication
|
||||||
|
from .browser_window import BrowserWindow
|
||||||
|
|
||||||
|
def main():
    """Start the Qt application, show the browser window and run the event loop.

    Exits the process with the status returned by the event loop.
    """
    application = QApplication(sys.argv)
    window = BrowserWindow()
    window.show()
    exit_status = application.exec()
    sys.exit(exit_status)
|
||||||
|
|
||||||
|
# Start the application only when this module is run as the entry point,
# not when it is imported.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue