From d0a014705bf05b813da68bbdbcd5ec53dddf6627 Mon Sep 17 00:00:00 2001 From: Kibi Kelburton Date: Wed, 20 May 2026 19:10:50 +0200 Subject: [PATCH] updating phash/dhash generation --- src/comment_upload_handler.mjs | 73 +++---------- src/inc/queue.mjs | 187 ++++++++++++++++++++++----------- 2 files changed, 140 insertions(+), 120 deletions(-) diff --git a/src/comment_upload_handler.mjs b/src/comment_upload_handler.mjs index 0816ef2..600837e 100644 --- a/src/comment_upload_handler.mjs +++ b/src/comment_upload_handler.mjs @@ -374,29 +374,23 @@ export const handleCommentUpload = async (req, res) => { try { phash = await queue.generatePHash(tmpPath); if (phash && !linkedToExisting) { - // Check comment_files for visual duplicate - const cfItems = await db` - SELECT id, phash, dest FROM comment_files - WHERE phash IS NOT NULL AND phash != '' AND phash NOT LIKE '00000000%' - `; - for (const cf of cfItems) { - if (isPhashMatch(phash, cf.phash)) { - const existingAbsPath = path.join(cfg.paths.c, cf.dest); - try { - const realTarget = await fs.realpath(existingAbsPath); - const destPath = path.join(cfg.paths.c, filename); - const relTarget = path.relative(path.dirname(destPath), realTarget); - await fs.symlink(relTarget, destPath); - linkedToExisting = true; - console.log(`[COMMENT_UPLOAD] PHash match in comment_files: ${filename} → ${relTarget}`); - } catch (e) { - console.error(`[COMMENT_UPLOAD] PHash symlink failed:`, e.message); - } - break; + // Check comment_files for visual duplicate using fast SQL query + const commentMatch = await queue.checkcommentrepostphash(phash); + if (commentMatch) { + const existingAbsPath = path.join(cfg.paths.c, commentMatch.dest); + try { + const realTarget = await fs.realpath(existingAbsPath); + const destPath = path.join(cfg.paths.c, filename); + const relTarget = path.relative(path.dirname(destPath), realTarget); + await fs.symlink(relTarget, destPath); + linkedToExisting = true; + console.log(`[COMMENT_UPLOAD] PHash match in comment_files: ${filename} → ${relTarget}`); + } catch (e) { + console.error(`[COMMENT_UPLOAD] PHash symlink failed:`, e.message); } } - - // Also check items table for visual duplicate + + // Also check items table for visual duplicate using fast SQL query if (!linkedToExisting) { const phashMatch = await queue.checkrepostphash(phash); if (phashMatch) { @@ -541,41 +535,4 @@ async function generateCommentThumbnail(filename, mime, uuid, size = 512) { await fs.unlink(tmpFile).catch(() => { }); } -/** - * PHash matching helper (same logic as queue.checkrepostphash) - */ -function isPhashMatch(newHash, dbHash) { - if (!newHash || !dbHash) return false; - const newHashes = newHash.split('_'); - const dbHashes = dbHash.split('_'); - const THRESHOLD = 15; - const getHammingDistance = (h1, h2) => { - if (!h1 || !h2 || h1.length !== h2.length) return 9999; - let distance = 0; - for (let i = 0; i < h1.length; i += 2) { - const v1 = parseInt(h1.substr(i, 2), 16); - const v2 = parseInt(h2.substr(i, 2), 16); - let xor = v1 ^ v2; - while (xor) { - distance += xor & 1; - xor >>= 1; - } - } - return distance; - }; - - const framesToCompare = Math.min(newHashes.length, dbHashes.length); - let matches = 0; - - for (let i = 0; i < framesToCompare; i++) { - const dist = getHammingDistance(newHashes[i], dbHashes[i]); - if (dist <= THRESHOLD) matches++; - } - - if (framesToCompare >= 3 && matches >= 2) return true; - if (framesToCompare === 1 && matches === 1) return true; - if (framesToCompare === 2 && matches >= 2) return true; - - return false; -} diff --git a/src/inc/queue.mjs b/src/inc/queue.mjs index cfe5853..43e5c86 100644 --- a/src/inc/queue.mjs +++ b/src/inc/queue.mjs @@ -6,6 +6,28 @@ import cfg from "./config.mjs"; import path from "path"; import os from "os"; +function isFlatFrame(buffer) { + if (!buffer || buffer.length !== 1056) return true; + let min = 255; + let max = 0; + let sum = 0; + for (let i = 0; i < buffer.length; i++) { + const val = buffer[i]; + if (val < min) min = val; + if (val > max) max = val; + sum += val; + } + const mean = sum / buffer.length; + if (mean < 15 || mean > 240) return true; + + let sqDiffSum = 0; + for (let i = 0; i < buffer.length; i++) { + sqDiffSum += Math.pow(buffer[i] - mean, 2); + } + const variance = sqDiffSum / buffer.length; + return variance < 10 || (max - min) < 15; +} + export default new class queue { constructor() { @@ -85,31 +107,52 @@ export default new class queue { async generatePHash(source) { try { // Temporal dHash implementation: - // 1. Get duration. - // 2. Extract 3 frames: 10%, 50%, 90%. - // 3. Generate dHash for each. - // 4. Return combined hash "hash1_hash2_hash3". + // 1. Check if source is image/video and get duration. + // 2. For videos: Extract 3 frames (10%, 50%, 90% of duration). + // For static images: Extract 1 frame. + // 3. Generate dHash for each valid non-flat frame. + // 4. Return combined hash "hash1_hash2_hash3" or single "hash". // Skip ffprobe for PDFs (which would fail with "Invalid data") if (source.toLowerCase().endsWith('.pdf')) { return null; } - const durationStr = (await this.spawn('ffprobe', ['-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', source])).stdout.trim(); - const duration = parseFloat(durationStr); - if (isNaN(duration) || duration <= 0) return null; + let isVideo = true; + let timestamps = []; + + try { + const durationStr = (await this.spawn('ffprobe', ['-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', source])).stdout.trim(); + const duration = parseFloat(durationStr); + if (isNaN(duration) || duration <= 0) { + isVideo = false; + } else { + timestamps = [duration * 0.1, duration * 0.5, duration * 0.9]; + } + } catch (err) { + isVideo = false; + } + + if (!isVideo) { + timestamps = [0]; // Process static image as single frame + } - const timestamps = [duration * 0.1, duration * 0.5, duration * 0.9]; const hashes = []; for (const ts of timestamps) { let buffer; try { - const { stdout } = await this.spawn('ffmpeg', ['-ss', ts.toString(), '-v', 'error', '-i', source, '-vf', 'thumbnail,scale=33:32,format=gray', '-frames:v', '1', '-f', 'rawvideo', 'pipe:1'], { encoding: 'buffer', quiet: true }); + const vf = isVideo ? 'thumbnail,scale=33:32,format=gray' : 'scale=33:32,format=gray'; + const args = []; + if (isVideo) { + args.push('-ss', ts.toString()); + } + args.push('-v', 'error', '-i', source, '-vf', vf, '-frames:v', '1', '-f', 'rawvideo', 'pipe:1'); + + const { stdout } = await this.spawn('ffmpeg', args, { encoding: 'buffer', quiet: true }); buffer = stdout; } catch (err) { console.warn(`[PHASH] Failed to extract frame at ${ts}s for ${source}: ${err.message}`); - // Buffer remains undefined, triggering fallback below } if (!buffer || buffer.length !== 1056) { @@ -117,6 +160,12 @@ export default new class queue { continue; } + // Filter out flat/black frames (e.g. solid color backgrounds, fade-to-black) + if (isFlatFrame(buffer)) { + console.log(`[PHASH] Ignored flat/black frame at ${ts}s for ${source}`); + continue; + } + let hash = ''; let currentByte = 0; let bitCount = 0; @@ -151,66 +200,80 @@ export default new class queue { async checkrepostphash(newHash) { if (!newHash) return false; - const newHashes = newHash.split('_'); + const newHashes = newHash.split('_').filter(s => s && !s.startsWith('00000000')); if (newHashes.length === 0) return false; - // Fetch all phashes, filtering out "all zero" failed hashes - const items = await db` - SELECT id, phash FROM items - WHERE phash IS NOT NULL - AND phash != '' - AND phash NOT LIKE '00000000%' + const h1 = newHashes[0] || ''; + const h2 = newHashes[1] || ''; + const h3 = newHashes[2] || ''; + + const results = await db` + SELECT id FROM items + WHERE phash IS NOT NULL AND phash != '' AND phash != 'ERROR' AND phash != 'MISSING' AND phash NOT LIKE '00000000%' + AND ( + ( + CASE WHEN split_part(phash, '_', 1) != '' AND ${h1} != '' THEN + bit_count(('x' || split_part(phash, '_', 1))::bit(1024) # ('x' || ${h1})::bit(1024)) <= 15 + ELSE false END::int + + + CASE WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN + bit_count(('x' || split_part(phash, '_', 2))::bit(1024) # ('x' || ${h2})::bit(1024)) <= 15 + ELSE false END::int + + + CASE WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN + bit_count(('x' || split_part(phash, '_', 3))::bit(1024) # ('x' || ${h3})::bit(1024)) <= 15 + ELSE false END::int + ) >= ( + CASE + WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN 2 + WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN 2 + ELSE 1 + END + ) + ) + LIMIT 1 `; - // Configurable threshold: max Hamming distance per 256-bit dHash frame. - // A value of 15 means < 6% bit difference — tight enough to only match true duplicates. - const THRESHOLD = 15; + return results.length > 0 ? results[0].id : false; + }; - const getHammingDistance = (h1, h2) => { - if (!h1 || !h2 || h1.length !== h2.length) return 9999; - let distance = 0; - for (let i = 0; i < h1.length; i += 2) { - const v1 = parseInt(h1.substr(i, 2), 16); - const v2 = parseInt(h2.substr(i, 2), 16); - let xor = v1 ^ v2; - while (xor) { - distance += xor & 1; - xor >>= 1; - } - } - return distance; - }; + async checkcommentrepostphash(newHash) { + if (!newHash) return false; + const newHashes = newHash.split('_').filter(s => s && !s.startsWith('00000000')); + if (newHashes.length === 0) return false; - // We want at least 2 out of 3 frames to match - const REQUIRED_MATCHES = 2; + const h1 = newHashes[0] || ''; + const h2 = newHashes[1] || ''; + const h3 = newHashes[2] || ''; - for (const item of items) { - // Handle legacy single hashes vs new multi-hashes - const dbHashes = item.phash.split('_'); + const results = await db` + SELECT id, dest FROM comment_files + WHERE phash IS NOT NULL AND phash != '' AND phash NOT LIKE '00000000%' + AND ( + ( + CASE WHEN split_part(phash, '_', 1) != '' AND ${h1} != '' THEN + bit_count(('x' || split_part(phash, '_', 1))::bit(1024) # ('x' || ${h1})::bit(1024)) <= 15 + ELSE false END::int + + + CASE WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN + bit_count(('x' || split_part(phash, '_', 2))::bit(1024) # ('x' || ${h2})::bit(1024)) <= 15 + ELSE false END::int + + + CASE WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN + bit_count(('x' || split_part(phash, '_', 3))::bit(1024) # ('x' || ${h3})::bit(1024)) <= 15 + ELSE false END::int + ) >= ( + CASE + WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN 2 + WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN 2 + ELSE 1 + END + ) + ) + LIMIT 1 + `; - let matches = 0; - // Compare corresponding frames: 0vs0, 1vs1, 2vs2 - const framesToCompare = Math.min(newHashes.length, dbHashes.length); - - for (let i = 0; i < framesToCompare; i++) { - const dist = getHammingDistance(newHashes[i], dbHashes[i]); - if (dist <= THRESHOLD) { - matches++; - } - } - - // If we have 3 frames, require 2 out of 3 matches. - // If we are comparing against a legacy 1-frame hash, require that single frame to match. - if (framesToCompare >= 3 && matches >= REQUIRED_MATCHES) { - return item.id; - } else if (framesToCompare === 1 && matches === 1) { - return item.id; - } else if (framesToCompare === 2 && matches >= 2) { - return item.id; - } - } - - return false; + return results.length > 0 ? results[0] : false; }; async genuuid() {