updating phash/dhash generation

This commit is contained in:
2026-05-20 19:10:50 +02:00
parent 07edfcb71d
commit d0a014705b
2 changed files with 140 additions and 120 deletions

View File

@@ -374,29 +374,23 @@ export const handleCommentUpload = async (req, res) => {
try {
phash = await queue.generatePHash(tmpPath);
if (phash && !linkedToExisting) {
// Check comment_files for visual duplicate
const cfItems = await db`
SELECT id, phash, dest FROM comment_files
WHERE phash IS NOT NULL AND phash != '' AND phash NOT LIKE '00000000%'
`;
for (const cf of cfItems) {
if (isPhashMatch(phash, cf.phash)) {
const existingAbsPath = path.join(cfg.paths.c, cf.dest);
try {
const realTarget = await fs.realpath(existingAbsPath);
const destPath = path.join(cfg.paths.c, filename);
const relTarget = path.relative(path.dirname(destPath), realTarget);
await fs.symlink(relTarget, destPath);
linkedToExisting = true;
console.log(`[COMMENT_UPLOAD] PHash match in comment_files: ${filename}${relTarget}`);
} catch (e) {
console.error(`[COMMENT_UPLOAD] PHash symlink failed:`, e.message);
}
break;
// Check comment_files for visual duplicate using fast SQL query
const commentMatch = await queue.checkcommentrepostphash(phash);
if (commentMatch) {
const existingAbsPath = path.join(cfg.paths.c, commentMatch.dest);
try {
const realTarget = await fs.realpath(existingAbsPath);
const destPath = path.join(cfg.paths.c, filename);
const relTarget = path.relative(path.dirname(destPath), realTarget);
await fs.symlink(relTarget, destPath);
linkedToExisting = true;
console.log(`[COMMENT_UPLOAD] PHash match in comment_files: ${filename}${relTarget}`);
} catch (e) {
console.error(`[COMMENT_UPLOAD] PHash symlink failed:`, e.message);
}
}
// Also check items table for visual duplicate
// Also check items table for visual duplicate using fast SQL query
if (!linkedToExisting) {
const phashMatch = await queue.checkrepostphash(phash);
if (phashMatch) {
@@ -541,41 +535,4 @@ async function generateCommentThumbnail(filename, mime, uuid, size = 512) {
await fs.unlink(tmpFile).catch(() => { });
}
/**
 * Decide whether two temporal perceptual hashes match
 * (same logic as queue.checkrepostphash).
 *
 * Each argument is a string of hex-encoded frame hashes joined by '_'.
 * Frames are compared pairwise by position (0 vs 0, 1 vs 1, ...), up to the
 * number of frames the shorter hash provides. A pair counts as a hit when the
 * bitwise Hamming distance between the two hex strings is at most 15.
 *
 * @param {string} newHash - hash of the candidate file
 * @param {string} dbHash - previously stored hash
 * @returns {boolean} true when one compared frame hits (single-frame
 *   comparison) or at least two hit (two or more frames compared)
 */
function isPhashMatch(newHash, dbHash) {
  if (!newHash || !dbHash) {
    return false;
  }

  const MAX_BIT_DIFFERENCE = 15;
  const candidateFrames = newHash.split('_');
  const storedFrames = dbHash.split('_');

  // Bit difference between two equally long hex strings, taken one byte
  // (two hex digits) at a time. Unusable input yields a distance far above
  // the threshold so it can never count as a hit.
  const bitDifference = (left, right) => {
    if (!left || !right || left.length !== right.length) {
      return 9999;
    }
    let total = 0;
    for (let pos = 0; pos < left.length; pos += 2) {
      const a = parseInt(left.slice(pos, pos + 2), 16);
      const b = parseInt(right.slice(pos, pos + 2), 16);
      for (let diff = a ^ b; diff; diff >>= 1) {
        total += diff & 1;
      }
    }
    return total;
  };

  const compared = Math.min(candidateFrames.length, storedFrames.length);
  let hits = 0;
  for (let idx = 0; idx < compared; idx++) {
    if (bitDifference(candidateFrames[idx], storedFrames[idx]) <= MAX_BIT_DIFFERENCE) {
      hits += 1;
    }
  }

  // split() always yields at least one element, so `compared` is >= 1 here:
  // a lone frame must hit; with two or more frames, two hits are required.
  const needed = compared === 1 ? 1 : 2;
  return hits >= needed;
}

View File

@@ -6,6 +6,28 @@ import cfg from "./config.mjs";
import path from "path";
import os from "os";
/**
 * Report whether a raw frame buffer is too uniform to be worth hashing.
 *
 * A buffer is treated as flat when any of the following holds:
 *  - it is missing or its length is not exactly 1056 bytes,
 *  - its mean byte value is below 15 or above 240,
 *  - the gap between its largest and smallest byte is under 15,
 *  - its population variance is under 10.
 *
 * @param {Uint8Array|Buffer} buffer - frame bytes (one byte per sample)
 * @returns {boolean} true when the frame should be ignored
 */
function isFlatFrame(buffer) {
  const FRAME_BYTES = 1056;
  if (!buffer || buffer.length !== FRAME_BYTES) {
    return true;
  }

  // First pass: extremes and sum.
  let lowest = 255;
  let highest = 0;
  let total = 0;
  for (let idx = 0; idx < FRAME_BYTES; idx++) {
    const sample = buffer[idx];
    lowest = sample < lowest ? sample : lowest;
    highest = sample > highest ? sample : highest;
    total += sample;
  }

  const average = total / FRAME_BYTES;
  if (average < 15 || average > 240) {
    return true;
  }
  if (highest - lowest < 15) {
    return true;
  }

  // Second pass: population variance around the mean.
  let squaredDeviation = 0;
  for (let idx = 0; idx < FRAME_BYTES; idx++) {
    squaredDeviation += (buffer[idx] - average) ** 2;
  }
  return squaredDeviation / FRAME_BYTES < 10;
}
export default new class queue {
constructor() {
@@ -85,31 +107,52 @@ export default new class queue {
async generatePHash(source) {
try {
// Temporal dHash implementation:
// 1. Get duration.
// 2. Extract 3 frames: 10%, 50%, 90%.
// 3. Generate dHash for each.
// 4. Return combined hash "hash1_hash2_hash3".
// 1. Check if source is image/video and get duration.
// 2. For videos: Extract 3 frames (10%, 50%, 90% of duration).
// For static images: Extract 1 frame.
// 3. Generate dHash for each valid non-flat frame.
// 4. Return combined hash "hash1_hash2_hash3" or single "hash".
// Skip ffprobe for PDFs (which would fail with "Invalid data")
if (source.toLowerCase().endsWith('.pdf')) {
return null;
}
const durationStr = (await this.spawn('ffprobe', ['-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', source])).stdout.trim();
const duration = parseFloat(durationStr);
if (isNaN(duration) || duration <= 0) return null;
let isVideo = true;
let timestamps = [];
try {
const durationStr = (await this.spawn('ffprobe', ['-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', source])).stdout.trim();
const duration = parseFloat(durationStr);
if (isNaN(duration) || duration <= 0) {
isVideo = false;
} else {
timestamps = [duration * 0.1, duration * 0.5, duration * 0.9];
}
} catch (err) {
isVideo = false;
}
if (!isVideo) {
timestamps = [0]; // Process static image as single frame
}
const timestamps = [duration * 0.1, duration * 0.5, duration * 0.9];
const hashes = [];
for (const ts of timestamps) {
let buffer;
try {
const { stdout } = await this.spawn('ffmpeg', ['-ss', ts.toString(), '-v', 'error', '-i', source, '-vf', 'thumbnail,scale=33:32,format=gray', '-frames:v', '1', '-f', 'rawvideo', 'pipe:1'], { encoding: 'buffer', quiet: true });
const vf = isVideo ? 'thumbnail,scale=33:32,format=gray' : 'scale=33:32,format=gray';
const args = [];
if (isVideo) {
args.push('-ss', ts.toString());
}
args.push('-v', 'error', '-i', source, '-vf', vf, '-frames:v', '1', '-f', 'rawvideo', 'pipe:1');
const { stdout } = await this.spawn('ffmpeg', args, { encoding: 'buffer', quiet: true });
buffer = stdout;
} catch (err) {
console.warn(`[PHASH] Failed to extract frame at ${ts}s for ${source}: ${err.message}`);
// Buffer remains undefined, triggering fallback below
}
if (!buffer || buffer.length !== 1056) {
@@ -117,6 +160,12 @@ export default new class queue {
continue;
}
// Filter out flat/black frames (e.g. solid color backgrounds, fade-to-black)
if (isFlatFrame(buffer)) {
console.log(`[PHASH] Ignored flat/black frame at ${ts}s for ${source}`);
continue;
}
let hash = '';
let currentByte = 0;
let bitCount = 0;
@@ -151,66 +200,80 @@ export default new class queue {
async checkrepostphash(newHash) {
if (!newHash) return false;
const newHashes = newHash.split('_');
const newHashes = newHash.split('_').filter(s => s && !s.startsWith('00000000'));
if (newHashes.length === 0) return false;
// Fetch all phashes, filtering out "all zero" failed hashes
const items = await db`
SELECT id, phash FROM items
WHERE phash IS NOT NULL
AND phash != ''
AND phash NOT LIKE '00000000%'
const h1 = newHashes[0] || '';
const h2 = newHashes[1] || '';
const h3 = newHashes[2] || '';
const results = await db`
SELECT id FROM items
WHERE phash IS NOT NULL AND phash != '' AND phash != 'ERROR' AND phash != 'MISSING' AND phash NOT LIKE '00000000%'
AND (
(
CASE WHEN split_part(phash, '_', 1) != '' AND ${h1} != '' THEN
bit_count(('x' || split_part(phash, '_', 1))::bit(1024) # ('x' || ${h1})::bit(1024)) <= 15
ELSE false END::int
+
CASE WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN
bit_count(('x' || split_part(phash, '_', 2))::bit(1024) # ('x' || ${h2})::bit(1024)) <= 15
ELSE false END::int
+
CASE WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN
bit_count(('x' || split_part(phash, '_', 3))::bit(1024) # ('x' || ${h3})::bit(1024)) <= 15
ELSE false END::int
) >= (
CASE
WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN 2
WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN 2
ELSE 1
END
)
)
LIMIT 1
`;
// Configurable threshold: max Hamming distance per 256-bit dHash frame.
// A value of 15 means < 6% bit difference — tight enough to only match true duplicates.
const THRESHOLD = 15;
return results.length > 0 ? results[0].id : false;
};
const getHammingDistance = (h1, h2) => {
if (!h1 || !h2 || h1.length !== h2.length) return 9999;
let distance = 0;
for (let i = 0; i < h1.length; i += 2) {
const v1 = parseInt(h1.substr(i, 2), 16);
const v2 = parseInt(h2.substr(i, 2), 16);
let xor = v1 ^ v2;
while (xor) {
distance += xor & 1;
xor >>= 1;
}
}
return distance;
};
async checkcommentrepostphash(newHash) {
if (!newHash) return false;
const newHashes = newHash.split('_').filter(s => s && !s.startsWith('00000000'));
if (newHashes.length === 0) return false;
// We want at least 2 out of 3 frames to match
const REQUIRED_MATCHES = 2;
const h1 = newHashes[0] || '';
const h2 = newHashes[1] || '';
const h3 = newHashes[2] || '';
for (const item of items) {
// Handle legacy single hashes vs new multi-hashes
const dbHashes = item.phash.split('_');
const results = await db`
SELECT id, dest FROM comment_files
WHERE phash IS NOT NULL AND phash != '' AND phash NOT LIKE '00000000%'
AND (
(
CASE WHEN split_part(phash, '_', 1) != '' AND ${h1} != '' THEN
bit_count(('x' || split_part(phash, '_', 1))::bit(1024) # ('x' || ${h1})::bit(1024)) <= 15
ELSE false END::int
+
CASE WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN
bit_count(('x' || split_part(phash, '_', 2))::bit(1024) # ('x' || ${h2})::bit(1024)) <= 15
ELSE false END::int
+
CASE WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN
bit_count(('x' || split_part(phash, '_', 3))::bit(1024) # ('x' || ${h3})::bit(1024)) <= 15
ELSE false END::int
) >= (
CASE
WHEN split_part(phash, '_', 3) != '' AND ${h3} != '' THEN 2
WHEN split_part(phash, '_', 2) != '' AND ${h2} != '' THEN 2
ELSE 1
END
)
)
LIMIT 1
`;
let matches = 0;
// Compare corresponding frames: 0vs0, 1vs1, 2vs2
const framesToCompare = Math.min(newHashes.length, dbHashes.length);
for (let i = 0; i < framesToCompare; i++) {
const dist = getHammingDistance(newHashes[i], dbHashes[i]);
if (dist <= THRESHOLD) {
matches++;
}
}
// If we have 3 frames, require 2 out of 3 matches.
// If we are comparing against a legacy 1-frame hash, require that single frame to match.
if (framesToCompare >= 3 && matches >= REQUIRED_MATCHES) {
return item.id;
} else if (framesToCompare === 1 && matches === 1) {
return item.id;
} else if (framesToCompare === 2 && matches >= 2) {
return item.id;
}
}
return false;
return results.length > 0 ? results[0] : false;
};
async genuuid() {