Files
f0ckm/debug/find_duplicates.mjs
2026-04-25 19:51:52 +02:00

105 lines
3.3 KiB
JavaScript

import db from "../src/inc/sql.mjs";
// Maximum Hamming distance (in differing bits) at which two frame hashes are
// still counted as a matching frame; the comparison is inclusive (`<=`).
const THRESHOLD = 15;
// Number of matching frames required to call two items duplicates when at
// least three frames can be compared. Pairs with only one or two comparable
// frames must match on every compared frame instead.
const REQUIRED_MATCHES = 2;
// Hamming distance helper — operates on a single hex-encoded hash segment
const getHammingDistance = (h1, h2) => {
if (!h1 || !h2 || h1.length !== h2.length) return 9999;
let distance = 0;
for (let i = 0; i < h1.length; i += 2) {
const v1 = parseInt(h1.substr(i, 2), 16);
const v2 = parseInt(h2.substr(i, 2), 16);
let xor = v1 ^ v2;
while (xor) {
distance += xor & 1;
xor >>= 1;
}
}
return distance;
};
/**
 * Compares two multi-frame hashes frame by frame (private helper).
 *
 * Only the frames present in both hashes are compared, position by position.
 * A frame matches when its Hamming distance is at most THRESHOLD. The pair is
 * a duplicate when:
 *   - 3+ comparable frames: at least REQUIRED_MATCHES frames match;
 *   - exactly 2 comparable frames: both match;
 *   - exactly 1 comparable frame: it matches.
 *
 * @param {string[]} aHashes - Per-frame hash segments of the first item.
 * @param {string[]} bHashes - Per-frame hash segments of the second item.
 * @returns {number|null} Rounded average distance over the compared frames
 *   when the pair is a duplicate, otherwise null.
 */
const compareFrameHashes = (aHashes, bHashes) => {
  const framesToCompare = Math.min(aHashes.length, bHashes.length);
  let matchCount = 0;
  let totalDist = 0;
  // Single pass: count matching frames and accumulate the sum for the average.
  for (let f = 0; f < framesToCompare; f++) {
    const dist = getHammingDistance(aHashes[f], bHashes[f]);
    totalDist += dist;
    if (dist <= THRESHOLD) matchCount++;
  }

  const isMatch = (framesToCompare >= 3 && matchCount >= REQUIRED_MATCHES)
    || (framesToCompare === 2 && matchCount >= 2)
    || (framesToCompare === 1 && matchCount === 1);

  return isMatch ? Math.round(totalDist / framesToCompare) : null;
};

/**
 * Scans every item with a usable phash and prints groups of likely duplicates.
 *
 * Items are read in ascending id order. Each not-yet-grouped item is compared
 * against every later not-yet-grouped item; matches are attached to the
 * earlier item (reported as the "original") and excluded from further
 * comparisons, so each item appears in at most one group.
 *
 * Results are written to stdout and the process then exits with code 0.
 *
 * @returns {Promise<void>} Rejects if the database query fails.
 */
async function findDuplicates() {
  console.log("Fetching items...");
  // Fetch all valid phashes
  const items = await db`
SELECT id, phash
FROM items
WHERE phash IS NOT NULL
AND phash != ''
AND phash != 'MISSING'
AND phash != 'ERROR'
AND phash NOT LIKE '00000000%'
ORDER BY id ASC
`;
  console.log(`Checking ${items.length} items for duplicates (Threshold: ${THRESHOLD}, Required frame matches: ${REQUIRED_MATCHES})...`);

  // Split multi-frame hashes once up front — do NOT compare the whole string,
  // and do not re-split inside the O(n²) pair loop.
  const frameHashes = items.map(({ phash }) => phash.split('_'));

  const duplicates = new Map(); // Map<OriginalID, List<{id, dist}>>
  const processed = new Set();

  for (let i = 0; i < items.length; i++) {
    const current = items[i];
    if (processed.has(current.id)) continue;

    const matchList = [];
    for (let j = i + 1; j < items.length; j++) {
      const compare = items[j];
      if (processed.has(compare.id)) continue;

      const avgDist = compareFrameHashes(frameHashes[i], frameHashes[j]);
      if (avgDist === null) continue;

      matchList.push({ id: compare.id, dist: avgDist });
      // A matched item belongs to this group only; skip it from now on.
      processed.add(compare.id);
    }

    if (matchList.length > 0) {
      duplicates.set(current.id, matchList);
      processed.add(current.id);
    }
  }

  if (duplicates.size === 0) {
    console.log("No duplicates found.");
  } else {
    console.log(`Found ${duplicates.size} duplicate sets:`);
    console.log("---------------------------------------------------");
  }
  for (const [originalId, matchList] of duplicates.entries()) {
    const matchStr = matchList.map(m => `ID:${m.id} (avg-dist:${m.dist})`).join(", ");
    console.log(`Original ID: ${originalId} matches with: ${matchStr}`);
  }

  // Exit explicitly once the report has been printed.
  process.exit(0);
}
// Script entry point: run the scan and, if it rejects, log the error and
// exit with a non-zero status.
const reportFailureAndExit = (failure) => {
  console.error(failure);
  process.exit(1);
};

findDuplicates().catch(reportFailureAndExit);