updating phash/dhash generation

This commit is contained in:
2026-05-20 19:10:50 +02:00
parent 07edfcb71d
commit d0a014705b
2 changed files with 140 additions and 120 deletions

View File

@@ -374,14 +374,10 @@ export const handleCommentUpload = async (req, res) => {
try { try {
phash = await queue.generatePHash(tmpPath); phash = await queue.generatePHash(tmpPath);
if (phash && !linkedToExisting) { if (phash && !linkedToExisting) {
// Check comment_files for visual duplicate // Check comment_files for visual duplicate using fast SQL query
const cfItems = await db` const commentMatch = await queue.checkcommentrepostphash(phash);
SELECT id, phash, dest FROM comment_files if (commentMatch) {
WHERE phash IS NOT NULL AND phash != '' AND phash NOT LIKE '00000000%' const existingAbsPath = path.join(cfg.paths.c, commentMatch.dest);
`;
for (const cf of cfItems) {
if (isPhashMatch(phash, cf.phash)) {
const existingAbsPath = path.join(cfg.paths.c, cf.dest);
try { try {
const realTarget = await fs.realpath(existingAbsPath); const realTarget = await fs.realpath(existingAbsPath);
const destPath = path.join(cfg.paths.c, filename); const destPath = path.join(cfg.paths.c, filename);
@@ -392,11 +388,9 @@ export const handleCommentUpload = async (req, res) => {
} catch (e) { } catch (e) {
console.error(`[COMMENT_UPLOAD] PHash symlink failed:`, e.message); console.error(`[COMMENT_UPLOAD] PHash symlink failed:`, e.message);
} }
break;
}
} }
// Also check items table for visual duplicate // Also check items table for visual duplicate using fast SQL query
if (!linkedToExisting) { if (!linkedToExisting) {
const phashMatch = await queue.checkrepostphash(phash); const phashMatch = await queue.checkrepostphash(phash);
if (phashMatch) { if (phashMatch) {
@@ -541,41 +535,4 @@ async function generateCommentThumbnail(filename, mime, uuid, size = 512) {
await fs.unlink(tmpFile).catch(() => { }); await fs.unlink(tmpFile).catch(() => { });
} }
/**
 * PHash matching helper: decides whether two stored hash strings describe
 * visually matching media.
 *
 * A hash string is one or more hex-encoded frame hashes joined by "_".
 * Frames are compared position by position (0 vs 0, 1 vs 1, ...) over the
 * shorter of the two lists, and a frame pair "matches" when its Hamming
 * distance is at most THRESHOLD bits.
 *
 * Decision rule:
 *   - 3 or more comparable frames: at least 2 must match
 *   - exactly 2 comparable frames: both must match
 *   - exactly 1 comparable frame:  it must match
 *
 * Frames that are empty, differ in length, or contain non-hex characters
 * never match. Without the hex check, parseInt() yields NaN for such input
 * and NaN ^ NaN evaluates to 0, so two unrelated malformed values of equal
 * length would be reported as identical.
 *
 * @param {string} newHash - Hash string of the newly uploaded file.
 * @param {string} dbHash - Hash string already stored in the database.
 * @returns {boolean} true when the hashes are considered a visual match.
 */
function isPhashMatch(newHash, dbHash) {
  if (!newHash || !dbHash) return false;

  // Maximum number of differing bits for a single frame pair to count as a match.
  const THRESHOLD = 15;
  // Sentinel distance guaranteed to exceed THRESHOLD.
  const NO_MATCH = 9999;
  // No "g" flag, so .test() keeps no state between calls.
  const HEX_FRAME = /^[0-9a-f]+$/i;

  const getHammingDistance = (h1, h2) => {
    if (!h1 || !h2 || h1.length !== h2.length) return NO_MATCH;
    // Reject malformed frames up front instead of letting NaN collapse to 0.
    if (!HEX_FRAME.test(h1) || !HEX_FRAME.test(h2)) return NO_MATCH;

    let distance = 0;
    for (let i = 0; i < h1.length; i += 2) {
      const v1 = Number.parseInt(h1.slice(i, i + 2), 16);
      const v2 = Number.parseInt(h2.slice(i, i + 2), 16);
      let xor = v1 ^ v2;
      // Count the set bits of the XOR, i.e. the differing bits of this byte.
      while (xor) {
        distance += xor & 1;
        xor >>= 1;
      }
    }
    return distance;
  };

  const newHashes = newHash.split('_');
  const dbHashes = dbHash.split('_');
  const framesToCompare = Math.min(newHashes.length, dbHashes.length);

  let matches = 0;
  for (let i = 0; i < framesToCompare; i++) {
    if (getHammingDistance(newHashes[i], dbHashes[i]) <= THRESHOLD) matches++;
  }

  if (framesToCompare >= 3) return matches >= 2;
  if (framesToCompare === 2) return matches >= 2;
  if (framesToCompare === 1) return matches === 1;
  return false;
}

View File

@@ -6,6 +6,28 @@ import cfg from "./config.mjs";
import path from "path"; import path from "path";
import os from "os"; import os from "os";
/**
 * Report whether a raw frame has too little visual detail to be worth hashing.
 *
 * A frame is considered flat when any of the following holds:
 *   - it is missing or not exactly 1056 bytes long,
 *   - its mean value is below 15 or above 240,
 *   - its variance is below 10,
 *   - the spread between its lowest and highest value is below 15.
 *
 * @param {Buffer|Uint8Array} buffer - Raw frame bytes, one value per pixel.
 * @returns {boolean} true when the frame should be ignored.
 */
function isFlatFrame(buffer) {
  // Missing or wrongly sized input is treated the same as a flat frame.
  if (!buffer || buffer.length !== 1056) return true;

  const pixelCount = buffer.length;
  let darkest = 255;
  let brightest = 0;
  let total = 0;
  for (const px of buffer) {
    if (px < darkest) darkest = px;
    if (px > brightest) brightest = px;
    total += px;
  }

  // Near-black or near-white frames are rejected before the variance pass.
  const average = total / pixelCount;
  if (average < 15 || average > 240) return true;

  let squaredDeviation = 0;
  for (const px of buffer) {
    squaredDeviation += (px - average) ** 2;
  }

  if (squaredDeviation / pixelCount < 10) return true;
  return brightest - darkest < 15;
}
export default new class queue { export default new class queue {
constructor() { constructor() {
@@ -85,31 +107,52 @@ export default new class queue {
async generatePHash(source) { async generatePHash(source) {
try { try {
// Temporal dHash implementation: // Temporal dHash implementation:
// 1. Get duration. // 1. Check if source is image/video and get duration.
// 2. Extract 3 frames: 10%, 50%, 90%. // 2. For videos: Extract 3 frames (10%, 50%, 90% of duration).
// 3. Generate dHash for each. // For static images: Extract 1 frame.
// 4. Return combined hash "hash1_hash2_hash3". // 3. Generate dHash for each valid non-flat frame.
// 4. Return combined hash "hash1_hash2_hash3" or single "hash".
// Skip ffprobe for PDFs (which would fail with "Invalid data") // Skip ffprobe for PDFs (which would fail with "Invalid data")
if (source.toLowerCase().endsWith('.pdf')) { if (source.toLowerCase().endsWith('.pdf')) {
return null; return null;
} }
let isVideo = true;
let timestamps = [];
try {
const durationStr = (await this.spawn('ffprobe', ['-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', source])).stdout.trim(); const durationStr = (await this.spawn('ffprobe', ['-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', source])).stdout.trim();
const duration = parseFloat(durationStr); const duration = parseFloat(durationStr);
if (isNaN(duration) || duration <= 0) return null; if (isNaN(duration) || duration <= 0) {
isVideo = false;
} else {
timestamps = [duration * 0.1, duration * 0.5, duration * 0.9];
}
} catch (err) {
isVideo = false;
}
if (!isVideo) {
timestamps = [0]; // Process static image as single frame
}
const timestamps = [duration * 0.1, duration * 0.5, duration * 0.9];
const hashes = []; const hashes = [];
for (const ts of timestamps) { for (const ts of timestamps) {
let buffer; let buffer;
try { try {
const { stdout } = await this.spawn('ffmpeg', ['-ss', ts.toString(), '-v', 'error', '-i', source, '-vf', 'thumbnail,scale=33:32,format=gray', '-frames:v', '1', '-f', 'rawvideo', 'pipe:1'], { encoding: 'buffer', quiet: true }); const vf = isVideo ? 'thumbnail,scale=33:32,format=gray' : 'scale=33:32,format=gray';
const args = [];
if (isVideo) {
args.push('-ss', ts.toString());
}
args.push('-v', 'error', '-i', source, '-vf', vf, '-frames:v', '1', '-f', 'rawvideo', 'pipe:1');
const { stdout } = await this.spawn('ffmpeg', args, { encoding: 'buffer', quiet: true });
buffer = stdout; buffer = stdout;
} catch (err) { } catch (err) {
console.warn(`[PHASH] Failed to extract frame at ${ts}s for ${source}: ${err.message}`); console.warn(`[PHASH] Failed to extract frame at ${ts}s for ${source}: ${err.message}`);
// Buffer remains undefined, triggering fallback below
} }
if (!buffer || buffer.length !== 1056) { if (!buffer || buffer.length !== 1056) {
@@ -117,6 +160,12 @@ export default new class queue {
continue; continue;
} }
// Filter out flat/black frames (e.g. solid color backgrounds, fade-to-black)
if (isFlatFrame(buffer)) {
console.log(`[PHASH] Ignored flat/black frame at ${ts}s for ${source}`);
continue;
}
let hash = ''; let hash = '';
let currentByte = 0; let currentByte = 0;
let bitCount = 0; let bitCount = 0;
@@ -151,66 +200,80 @@ export default new class queue {
async checkrepostphash(newHash) { async checkrepostphash(newHash) {
if (!newHash) return false; if (!newHash) return false;
const newHashes = newHash.split('_'); const newHashes = newHash.split('_').filter(s => s && !s.startsWith('00000000'));
if (newHashes.length === 0) return false; if (newHashes.length === 0) return false;
// Fetch all phashes, filtering out "all zero" failed hashes const h1 = newHashes[0] || '';
const items = await db` const h2 = newHashes[1] || '';
SELECT id, phash FROM items const h3 = newHashes[2] || '';
WHERE phash IS NOT NULL
AND phash != '' const results = await db`
AND phash NOT LIKE '00000000%' SELECT id FROM items
WHERE phash IS NOT NULL AND phash != '' AND phash != 'ERROR' AND phash != 'MISSING' AND phash NOT LIKE '00000000%'
AND (
(
CASE WHEN split_part(phash, '_', 1) != '' AND ${h1} != '' THEN
bit_count(('x' || split_part(phash, '_', 1))::bit(1024) # ('x' || ${h1})::bit(1024)) <= 15
ELSE false END::int
+
CASE WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN
bit_count(('x' || split_part(phash, '_', 2))::bit(1024) # ('x' || ${h2})::bit(1024)) <= 15
ELSE false END::int
+
CASE WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN
bit_count(('x' || split_part(phash, '_', 3))::bit(1024) # ('x' || ${h3})::bit(1024)) <= 15
ELSE false END::int
) >= (
CASE
WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN 2
WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN 2
ELSE 1
END
)
)
LIMIT 1
`; `;
// Configurable threshold: max Hamming distance per 256-bit dHash frame. return results.length > 0 ? results[0].id : false;
// A value of 15 means < 6% bit difference — tight enough to only match true duplicates.
const THRESHOLD = 15;
const getHammingDistance = (h1, h2) => {
if (!h1 || !h2 || h1.length !== h2.length) return 9999;
let distance = 0;
for (let i = 0; i < h1.length; i += 2) {
const v1 = parseInt(h1.substr(i, 2), 16);
const v2 = parseInt(h2.substr(i, 2), 16);
let xor = v1 ^ v2;
while (xor) {
distance += xor & 1;
xor >>= 1;
}
}
return distance;
}; };
// We want at least 2 out of 3 frames to match async checkcommentrepostphash(newHash) {
const REQUIRED_MATCHES = 2; if (!newHash) return false;
const newHashes = newHash.split('_').filter(s => s && !s.startsWith('00000000'));
if (newHashes.length === 0) return false;
for (const item of items) { const h1 = newHashes[0] || '';
// Handle legacy single hashes vs new multi-hashes const h2 = newHashes[1] || '';
const dbHashes = item.phash.split('_'); const h3 = newHashes[2] || '';
let matches = 0; const results = await db`
// Compare corresponding frames: 0vs0, 1vs1, 2vs2 SELECT id, dest FROM comment_files
const framesToCompare = Math.min(newHashes.length, dbHashes.length); WHERE phash IS NOT NULL AND phash != '' AND phash NOT LIKE '00000000%'
AND (
(
CASE WHEN split_part(phash, '_', 1) != '' AND ${h1} != '' THEN
bit_count(('x' || split_part(phash, '_', 1))::bit(1024) # ('x' || ${h1})::bit(1024)) <= 15
ELSE false END::int
+
CASE WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN
bit_count(('x' || split_part(phash, '_', 2))::bit(1024) # ('x' || ${h2})::bit(1024)) <= 15
ELSE false END::int
+
CASE WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN
bit_count(('x' || split_part(phash, '_', 3))::bit(1024) # ('x' || ${h3})::bit(1024)) <= 15
ELSE false END::int
) >= (
CASE
WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN 2
WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN 2
ELSE 1
END
)
)
LIMIT 1
`;
for (let i = 0; i < framesToCompare; i++) { return results.length > 0 ? results[0] : false;
const dist = getHammingDistance(newHashes[i], dbHashes[i]);
if (dist <= THRESHOLD) {
matches++;
}
}
// If we have 3 frames, require 2 out of 3 matches.
// If we are comparing against a legacy 1-frame hash, require that single frame to match.
if (framesToCompare >= 3 && matches >= REQUIRED_MATCHES) {
return item.id;
} else if (framesToCompare === 1 && matches === 1) {
return item.id;
} else if (framesToCompare === 2 && matches >= 2) {
return item.id;
}
}
return false;
}; };
async genuuid() { async genuuid() {