e3a380bbe4
Former-commit-id: 21906adfdf35e5deecb10eb286a5b09b706f20a4
1103 lines
31 KiB
C++
1103 lines
31 KiB
C++
/**
|
|
* vim: set ts=4 :
|
|
* =============================================================================
|
|
* SourceMod
|
|
* Copyright (C) 2004-2008 AlliedModders LLC. All rights reserved.
|
|
* =============================================================================
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify it under
|
|
* the terms of the GNU General Public License, version 3.0, as published by the
|
|
* Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
|
* details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License along with
|
|
* this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*
|
|
* As a special exception, AlliedModders LLC gives you permission to link the
|
|
* code of this program (as well as its derivative works) to "Half-Life 2," the
|
|
* "Source Engine," the "SourcePawn JIT," and any Game MODs that run on software
|
|
* by the Valve Corporation. You must obey the GNU General Public License in
|
|
* all respects for all other code used. Additionally, AlliedModders LLC grants
|
|
* this exception to all derivative works. AlliedModders LLC defines further
|
|
* exceptions, found in LICENSE.txt (as of this writing, version JULY-31-2007),
|
|
* or <http://www.sourcemod.net/license.php>.
|
|
*
|
|
* Version: $Id$
|
|
*/
|
|
|
|
#ifndef _INCLUDE_SOURCEMOD_TEMPLATED_TRIE_H_
|
|
#define _INCLUDE_SOURCEMOD_TEMPLATED_TRIE_H_
|
|
|
|
#include <new>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include <assert.h>
|
|
|
|
enum NodeType
{
	Node_Unused = 0,		/* Node is not being used (sparse) */
	Node_Arc,				/* Node is part of an arc and does not terminate */
	Node_Term,				/* Node is a terminator */
};

/**
 * @brief Trie class for storing key/value pairs, based on double array tries.
 * @file sm_trie_tpl.h
 *
 * For full works cited and implementation overview, there is a big comment
 * block at the bottom of this file.
 *
 * Values of type K are copy-constructed into raw (malloc'd) node storage with
 * placement new and destroyed explicitly.  K must be default-constructible and
 * copy-assignable, because insert() keeps a temporary K while splitting a
 * terminator node.
 *
 * NOTE: the class owns raw buffers and declares no copy constructor or
 * assignment operator; copying a KTrie would make two objects free the same
 * memory.  Pass it by pointer or reference.
 */
template <typename K>
class KTrie
{
	class KTrieNode;
public:
	/**
	 * @brief Clears all set objects in the trie.
	 *
	 * Destroys every stored value, including the one stored under the empty
	 * key, and resets the node array and the string table.
	 */
	void clear()
	{
		/* The empty key lives outside m_base, so it needs its own cleanup. */
		if (m_empty != NULL && m_empty->valset)
		{
			m_empty->value.~K();
			m_empty->valset = false;
		}

		run_destructors();
		internal_clear();
	}

	/**
	 * @brief Removes a key from the trie.
	 *
	 * Only the value is destroyed; the node path for the key is kept.
	 *
	 * @param key		Key to remove.
	 * @return			True on success, false if key was never set.
	 */
	bool remove(const char *key)
	{
		KTrieNode *node = internal_retrieve(key);
		if (node == NULL || !node->valset)
		{
			return false;
		}

		node->value.~K();
		node->valset = false;
		m_numElements--;

		return true;
	}

	/**
	 * @brief Retrieves a pointer to the object stored at a given key.
	 *
	 * @param key		Key to retrieve.
	 * @return			Pointer to object, or NULL if key was not found or not set.
	 */
	K * retrieve(const char *key)
	{
		KTrieNode *node = internal_retrieve(key);
		if (node != NULL && node->valset)
		{
			return &node->value;
		}
		return NULL;
	}

	/**
	 * @brief Inserts or updates the object stored at a key.
	 *
	 * @param key		Key to update or insert.
	 * @param obj		Object to store at the key.
	 * @return			True on success, false on failure.
	 */
	bool replace(const char *key, const K & obj)
	{
		KTrieNode *prev_node = internal_retrieve(key);

		/* A node can exist without holding a value (a removed key, or an
		 * interior arc node).  insert() knows how to mark such a node as set
		 * and account for it in the element count, so defer to it.
		 */
		if (prev_node == NULL || !prev_node->valset)
		{
			return insert(key, obj);
		}

		/* Overwrite in place: destroy the old object, copy-construct the new. */
		prev_node->value.~K();
		new (&prev_node->value) K(obj);

		return true;
	}

	/**
	 * @brief Inserts an object at a key.
	 *
	 * @param key		Key to insert at.
	 * @param obj		Object to store at the key.
	 * @return			True on success, false if the key is already set or
	 *					insertion otherwise failed.
	 */
	bool insert(const char *key, const K & obj)
	{
		unsigned int lastidx = 1;		/* the last node index */
		unsigned int curidx;			/* current node index */
		const char *keyptr = key;		/* input stream at current token */
		KTrieNode *node = NULL;			/* current node being processed */
		unsigned int q;					/* temporary var for x_check results */
		unsigned int curoffs;			/* current offset */

		/**
		 * Empty strings are a special case, since there are no productions.  We could
		 * probably rework it to use BASE[0] but this hack is easier.
		 */
		if (*key == '\0')
		{
			if (m_empty != NULL && m_empty->valset)
			{
				return false;
			}

			if (m_empty == NULL)
			{
				m_empty = (KTrieNode *)malloc(sizeof(KTrieNode));
				if (m_empty == NULL)
				{
					/* Out of memory: report failure instead of dereferencing NULL. */
					return false;
				}
			}

			m_empty->valset = true;
			new (&m_empty->value) K(obj);

			m_numElements++;

			return true;
		}

		/* Start traversing at the root node (1) */
		do
		{
			/* Find where the next character is, then advance */
			curidx = m_base[lastidx].idx;
			curoffs = charval(*keyptr);
			curidx += curoffs;
			node = &m_base[curidx];
			keyptr++;

			/* Check if this slot is supposed to be empty.  If so, we need to handle CASES 1/2:
			 * Insertion without collisions
			 */
			if ( (curidx > m_baseSize) || (node->mode == Node_Unused) )
			{
				if (curidx > m_baseSize)
				{
					if (!grow())
					{
						return false;
					}
					/* grow() reallocates m_base, so the node pointer must be refreshed. */
					node = &m_base[curidx];
				}
				node->parent = lastidx;
				if (*keyptr == '\0')
				{
					node->mode = Node_Arc;
				}
				else
				{
					node->idx = x_addstring(keyptr);
					node->mode = Node_Term;
				}
				node->valset = true;
				new (&node->value) K(obj);

				m_numElements++;

				return true;
			}
			else if (node->parent != lastidx)
			{
				/* Collision!  We have to split up the tree here.  CASE 4:
				 * Insertion when a new word is inserted with a collision.
				 * NOTE: This is the hardest case to handle.  All below examples are based on:
				 * BACHELOR, BADGE, inserting BABY.
				 * The problematic production here is A -> B, where B is already being used.
				 *
				 * This process has to rotate one half of the 'A' arc.  We generate two lists:
				 *  Outgoing Arcs - Anything leaving this 'A'
				 *  Incoming Arcs - Anything going to this 'A'
				 * Whichever list is smaller will be moved.  Note that this works because the intersection
				 * affects both arc chains, and moving one will make the slot available to either.
				 */
				KTrieNode *cur;

				/* Find every node arcing from the last node.
				 * I.e. for BACHELOR, BADGE, BABY,
				 * The arcs leaving A will be C and D, but our current node is B -> *.
				 * Thus, we use the last index (A) to find the base for arcs leaving A.
				 */
				unsigned int outgoing_base = m_base[lastidx].idx;
				unsigned int outgoing_list[256];
				unsigned int outgoing_count = 0;	/* count the current index here */
				cur = &m_base[outgoing_base] + 1;
				unsigned int outgoing_limit = 255;

				if (outgoing_base + outgoing_limit > m_baseSize)
				{
					outgoing_limit = m_baseSize - outgoing_base;
				}

				for (unsigned int i=1; i<=outgoing_limit; i++,cur++)
				{
					if (cur->mode == Node_Unused || cur->parent != lastidx)
					{
						continue;
					}
					outgoing_list[outgoing_count++] = i;
				}
				/* The slot we wanted is appended last so it takes part in the free-slot search. */
				outgoing_list[outgoing_count++] = curidx - outgoing_base;

				/* Now we need to find all the arcs leaving our parent...
				 * Note: the inconsistency is the base of our parent.
				 */
				assert(m_base[node->parent].mode == Node_Arc);
				unsigned int incoming_list[256];
				unsigned int incoming_base = m_base[node->parent].idx;
				unsigned int incoming_count = 0;
				unsigned int incoming_limit = 255;
				cur = &m_base[incoming_base] + 1;

				if (incoming_base + incoming_limit > m_baseSize)
				{
					incoming_limit = m_baseSize - incoming_base;
				}

				assert(incoming_limit > 0 && incoming_limit <= 255);

				for (unsigned int i=1; i<=incoming_limit; i++,cur++)
				{
					if (cur->mode == Node_Arc || cur->mode == Node_Term)
					{
						if (cur->parent == node->parent)
						{
							incoming_list[incoming_count++] = i;
						}
					}
				}

				if (incoming_count < outgoing_count + 1)
				{
					q = x_check_multi(incoming_list, incoming_count);

					/* x_check_multi() may have grown (reallocated) m_base. */
					node = &m_base[curidx];

					/* If we're incoming, we need to modify our parent */
					m_base[node->parent].idx = q;

					/* For each node in the "to move" list,
					 * Relocate the node's info to the new position.
					 */
					unsigned int idx, newidx, oldidx;
					for (unsigned int i=0; i<incoming_count; i++)
					{
						idx = incoming_list[i];
						newidx = q + idx;
						oldidx = incoming_base + idx;
						if (oldidx == lastidx)
						{
							/* Important! Make sure we're not invalidating our sacred lastidx */
							lastidx = newidx;
						}
						/* Fully copy the node */
						memcpy(&m_base[newidx], &m_base[oldidx], sizeof(KTrieNode));
						if (m_base[oldidx].valset)
						{
							new (&m_base[newidx].value) K(m_base[oldidx].value);
							m_base[oldidx].value.~K();
						}
						assert(m_base[m_base[newidx].parent].mode == Node_Arc);
						/* Erase old data */
						memset(&m_base[oldidx], 0, sizeof(KTrieNode));
						/* If we are not a terminator, we have children we must take care of */
						if (m_base[newidx].mode == Node_Arc)
						{
							KTrieNode *check_base = &m_base[m_base[newidx].idx] + 1;
							outgoing_limit = (m_base + m_baseSize + 1) - check_base;
							if (outgoing_limit > 255)
							{
								outgoing_limit = 255;
							}
							for (unsigned int j=1; j<=outgoing_limit; j++, check_base++)
							{
								if (check_base->parent == oldidx)
								{
									check_base->parent = newidx;
								}
							}
						}
					}
				}
				else
				{
					q = x_check_multi(outgoing_list, outgoing_count);

					/* x_check_multi() may have grown (reallocated) m_base. */
					node = &m_base[curidx];

					/* If we're outgoing, we need to modify our own base */
					m_base[lastidx].idx = q;

					/* Take the last index (curidx) out of the list.  Technically we are not moving this,
					 * since it's already being used by something else.
					 */
					outgoing_count--;

					/* For each node in the "to move" list,
					 * Relocate the node's info to the new position.
					 */
					unsigned int idx, newidx, oldidx;
					for (unsigned int i=0; i<outgoing_count; i++)
					{
						idx = outgoing_list[i];
						newidx = q + idx;
						oldidx = outgoing_base + idx;
						if (oldidx == lastidx)
						{
							/* Important! Make sure we're not invalidating our sacred lastidx */
							lastidx = newidx;
						}
						/* Fully copy the node */
						memcpy(&m_base[newidx], &m_base[oldidx], sizeof(KTrieNode));
						if (m_base[oldidx].valset)
						{
							new (&m_base[newidx].value) K(m_base[oldidx].value);
							m_base[oldidx].value.~K();
						}
						assert(m_base[m_base[newidx].parent].mode == Node_Arc);
						/* Erase old data */
						memset(&m_base[oldidx], 0, sizeof(KTrieNode));
						/* If we are not a terminator, we have children we must take care of */
						if (m_base[newidx].mode == Node_Arc)
						{
							KTrieNode *check_base = &m_base[m_base[newidx].idx] + 1;
							outgoing_limit = (m_base + m_baseSize + 1) - check_base;
							if (outgoing_limit > 255)
							{
								outgoing_limit = 255;
							}
							for (unsigned int j=1; j<=outgoing_limit; j++, check_base++)
							{
								if (check_base->parent == oldidx)
								{
									check_base->parent = newidx;
								}
							}
						}
					}

					/* Take the invisible node and use it as our new node */
					node = &m_base[q + outgoing_list[outgoing_count]];
				}

				/* We're finally done! */
				node->parent = lastidx;
				if (*keyptr == '\0')
				{
					node->mode = Node_Arc;
				}
				else
				{
					node->idx = x_addstring(keyptr);
					node->mode = Node_Term;
				}
				node->valset = true;
				new (&node->value) K(obj);

				m_numElements++;

				return true;
			}
			else
			{
				/* See what's in the next node - special case if terminator! */
				if (node->mode == Node_Term)
				{
					/* If we're a terminator, we need to handle CASE 3:
					 * Insertion when a terminating collision occurs
					 */
					char *term = &m_stringtab[node->idx];
					/* Do an initial browsing to make sure they're not the same string */
					if (strcmp(keyptr, term) == 0)
					{
						if (!node->valset)
						{
							node->valset = true;
							new (&node->value) K(obj);
							m_numElements++;
							return true;
						}
						/* Same string.  We can't insert. */
						return false;
					}
					/* For each matching character pair, we need to disband the terminator.
					 * This splits the similar prefix into a single arc path.
					 * First, save the old values so we can move them to a new node.
					 * Next, for each loop:
					 *  Take the current (invalid) node, and point it to the next arc base.
					 *  Set the current node to the node at the next arc.
					 */
					K oldvalue;
					bool oldvalset = node->valset;
					if (oldvalset)
					{
						oldvalue = node->value;
					}
					if (*term == *keyptr)
					{
						while (*term == *keyptr)
						{
							/* Find the next free slot in the check array.
							 * This is the "vector base" essentially
							 */
							q = x_check(*term);
							node = &m_base[curidx];
							/* Point the node to the next new base */
							node->idx = q;
							node->mode = Node_Arc;
							if (node->valset == true)
							{
								node->value.~K();
								node->valset = false;
							}
							/* Advance the input stream and local variables */
							lastidx = curidx;
							curidx = q + charval(*term);
							node = &m_base[curidx];
							/* Make sure the new current node has its parent set. */
							node->parent = lastidx;
							node->mode = Node_Arc;	/* Just in case we run x_check again */
							*term = '\0';	/* Unmark the string table here */
							term++;
							keyptr++;
						}
					}
					else if (node->valset)
					{
						node->valset = false;
						node->value.~K();
					}
					/* We're done inserting new pairs.  If one of them is exhausted,
					 * we take special shortcuts.
					 */
					if (*term == '\0')	//EX: BADGERHOUSE added over B -> ADGER.
					{
						/* First backpatch the current node - it ends the newly split terminator.
						 * In the example, this would mean the node is the production from R -> ?
						 * This node ends the old BADGER, so we set it here.
						 */
						node->valset = oldvalset;
						if (node->valset)
						{
							new (&node->value) K(oldvalue);
						}

						/* The terminator was split up, but pieces of keyptr remain.
						 * We need to generate a new production, in this example, R -> H,
						 * with H being a terminator to OUSE.  Thus we get:
						 * B,A,D,G,E,R*,H*->OUSE (* = value set).
						 * NOTE: parent was last set at the end of the while loop.
						 */
						/* Get the new base and apply re-basing */
						q = x_check(*keyptr);
						node = &m_base[curidx];

						node->idx = q;
						node->mode = Node_Arc;
						lastidx = curidx;
						/* Finish the final node */
						curidx = q + charval(*keyptr);
						node = &m_base[curidx];
						keyptr++;
						/* Optimize - don't add to string table if there's nothing more to eat */
						if (*keyptr == '\0')
						{
							node->mode = Node_Arc;
						}
						else
						{
							node->idx = x_addstring(keyptr);
							node->mode = Node_Term;
						}
						node->parent = lastidx;
						node->valset = true;
						new (&node->value) K(obj);
					}
					else if (*keyptr == '\0')
					{	//EX: BADGER added over B -> ADGERHOUSE
						/* First backpatch the current node - it ends newly split input string.
						 * This is the exact opposite of the above procedure.
						 */
						node->valset = true;
						new (&node->value) K(obj);

						/* Get the new base and apply re-basing */
						q = x_check(*term);
						node = &m_base[curidx];

						node->idx = q;
						node->mode = Node_Arc;
						lastidx = curidx;
						/* Finish the final node */
						curidx = q + charval(*term);
						node = &m_base[curidx];
						term++;
						/* Optimize - don't add to string table if there's nothing more to eat */
						if (*term == '\0')
						{
							node->mode = Node_Arc;
						}
						else
						{
							node->idx = (term - m_stringtab);	/* Already in the string table! */
							node->mode = Node_Term;
						}
						node->parent = lastidx;
						node->valset = oldvalset;
						if (node->valset)
						{
							new (&node->value) K(oldvalue);
						}
					}
					else
					{
						/* Finally, we have to create two new nodes instead of just one. */
						node->mode = Node_Arc;

						/* Get the new base and apply re-basing */
						q = x_check2(*keyptr, *term);
						node = &m_base[curidx];

						node->idx = q;
						lastidx = curidx;

						/* Re-create the old terminated node */
						curidx = q + charval(*term);
						node = &m_base[curidx];
						term++;
						node->valset = oldvalset;
						if (node->valset)
						{
							new (&node->value) K(oldvalue);
						}
						node->parent = lastidx;
						if (*term == '\0')
						{
							node->mode = Node_Arc;
						}
						else
						{
							node->mode = Node_Term;
							node->idx = (term - m_stringtab);	/* Already in the string table! */
						}

						/* Create the new keyed input node */
						curidx = q + charval(*keyptr);
						node = &m_base[curidx];
						keyptr++;
						node->valset = true;
						new (&node->value) K(obj);
						node->parent = lastidx;
						if (*keyptr == '\0')
						{
							node->mode = Node_Arc;
						}
						else
						{
							node->mode = Node_Term;
							node->idx = x_addstring(keyptr);
						}
					}

					m_numElements++;

					/* Phew! */
					return true;
				}
				else
				{
					assert(node->mode == Node_Arc);
				}
			}
			lastidx = curidx;
		} while (*keyptr != '\0');

		assert(node);

		/* If we've exhausted the string and we have a valid reached node,
		 * the production rule already existed.  Make sure it's valid to set first.
		 */

		/* We have to be an Arc.  If the last result was anything else, we would have returned a new
		 * production earlier.
		 */
		assert(node->mode == Node_Arc);

		if (!node->valset)
		{
			node->valset = true;
			new (&node->value) K(obj);
			m_numElements++;
			return true;
		}

		return false;
	}

	/**
	 * @brief Iterates over the trie returning all known values.
	 *
	 * Note: This function is for debugging.  Do not use it as a
	 * production iterator since it's inefficient.  Iteration is
	 * guaranteed to be sorted ascendingly.
	 *
	 * Keys longer than the buffer are reported truncated to
	 * maxlength - 1 characters.  The value stored under the empty
	 * key is not visited.  Nothing is iterated if maxlength is 0,
	 * since not even a terminator would fit.
	 *
	 * The callback function takes:
	 *  (KTrie) - Pointer to this Trie
	 *  (const char *) - String containing key name.
	 *  (K &) - By-reference object at the key.
	 *  (data) - User pointer.
	 *
	 * @param buffer		Buffer to use as a key name cache.
	 * @param maxlength		Maximum length of the key name buffer.
	 * @param data			User pointer for passing to the iterator.
	 * @param func			Iterator callback function.
	 */
	void bad_iterator(char *buffer,
					  size_t maxlength,
					  void *data,
					  void (*func)(KTrie *, const char *, K & obj, void *data))
	{
		if (maxlength == 0)
		{
			/* "maxlength - 1" below would wrap around and overrun the buffer. */
			return;
		}

		bad_iterator_r(buffer, maxlength, 0, data, func, 1);
	}

private:
	/**
	 * @brief Recursive worker for bad_iterator().
	 *
	 * Visits, in ascending character order, every child of the node at
	 * index root, appending each child's character to the key buffer.
	 *
	 * @param buffer		Key name buffer.
	 * @param maxlength		Size of the key name buffer (must be non-zero).
	 * @param buf_pos		Number of key characters already in the buffer.
	 * @param data			User pointer handed to the callback.
	 * @param func			Iterator callback function.
	 * @param root			Index of the arc node whose children are visited.
	 */
	void bad_iterator_r(char *buffer,
						size_t maxlength,
						size_t buf_pos,
						void *data,
						void (*func)(KTrie *, const char *, K & obj, void *data),
						unsigned int root)
	{
		char *term;
		unsigned int idx, limit, start;

		limit = 255;
		start = m_base[root].idx;

		/* Bound our limits */
		if (start + limit > m_baseSize)
		{
			limit = m_baseSize - start;
		}

		/* Search for strings */
		for (unsigned int i = 1; i <= limit; i++)
		{
			idx = start + i;
			if (m_base[idx].mode == Node_Unused
				|| m_base[idx].parent != root)
			{
				continue;
			}

			if (m_base[idx].mode == Node_Arc)
			{
				/* Remember where this level's prefix ends.  The character is only
				 * appended when it fits, so the position must be restored rather
				 * than blindly decremented afterwards.
				 */
				size_t save_buf_pos = buf_pos;

				if (buf_pos < maxlength - 1)
				{
					buffer[buf_pos++] = (char)i;
				}

				if (m_base[idx].valset)
				{
					buffer[buf_pos] = '\0';
					func(this, buffer, m_base[idx].value, data);
				}

				bad_iterator_r(buffer,
							   maxlength,
							   buf_pos,
							   data,
							   func,
							   idx);

				buf_pos = save_buf_pos;
			}
			else if (m_base[idx].mode == Node_Term
					 && m_base[idx].valset == true)
			{
				size_t save_buf_pos;

				save_buf_pos = buf_pos;
				if (buf_pos < maxlength - 1)
				{
					buffer[buf_pos++] = (char)i;
				}
				if (buf_pos < maxlength - 1)
				{
					size_t destlen, j;

					/* Append as much of the terminator's tail string as fits. */
					term = &m_stringtab[m_base[idx].idx];
					destlen = strlen(term);
					for (j = 0;
						 j < destlen && j + buf_pos < maxlength - 1;
						 j++)
					{
						buffer[buf_pos + j] = term[j];
					}
					buf_pos += j;
				}
				buffer[buf_pos] = '\0';

				func(this, buffer, m_base[idx].value, data);

				buf_pos = save_buf_pos;
			}
		}
	}
public:
	/**
	 * @brief Creates an empty trie with room for 256 nodes (plus the unused
	 * slot 0) and a 256 byte string table.
	 */
	KTrie()
	{
		m_base = (KTrieNode *)malloc(sizeof(KTrieNode) * (256 + 1));
		m_stringtab = (char *)malloc(sizeof(char) * 256);
		m_baseSize = 256;
		m_stSize = 256;
		m_empty = NULL;
		m_numElements = 0;

		internal_clear();
	}

	/**
	 * @brief Destroys every stored value and releases all buffers.
	 */
	~KTrie()
	{
		if (m_empty != NULL && m_empty->valset)
		{
			m_empty->value.~K();
			m_empty->valset = false;
		}
		free(m_empty);
		run_destructors();
		free(m_base);
		free(m_stringtab);
	}

	/**
	 * @brief Invokes a user supplied destructor callback on every value held
	 * in the node array and marks those nodes as having no value.
	 *
	 * The element count and the empty-key entry are left untouched.
	 *
	 * @param dtor			Callback receiving a pointer to each stored value.
	 */
	void run_destructor(void (*dtor)(K * ptr))
	{
		for (size_t i = 0; i <= m_baseSize; i++)
		{
			if (m_base[i].valset)
			{
				dtor(&m_base[i].value);
				m_base[i].valset = false;
			}
		}
	}
private:
	class KTrieNode
	{
		friend class KTrie;
	private:
		/**
		 * For Node_Arc, this index stores the 'base' offset to the next arc chain.
		 * I.e. to jump from this arc to character C, it will be at base[idx+C].
		 * For Node_Term, this is an index into the string table.
		 */
		unsigned int idx;

		/**
		 * This contains the prior arc that we must have come from.
		 * For example, if arc 63 has a base jump of index 12, and we want to see if
		 * there is a valid character C, the parent of 12+C must be 63.
		 */
		unsigned int parent;
		K value;				/* Value associated with this node */
		NodeType mode;			/* Current usage type of the node */
		bool valset;			/* Whether or not a value is set */
	};
private:
	/**
	 * @brief Walks the trie along key and returns the node it ends on.
	 *
	 * @param key			Key to look up.
	 * @return				Node reached by the key (its value may or may not
	 *						be set), or NULL if no such path exists.  For the
	 *						empty key this is m_empty, which may be NULL.
	 */
	KTrieNode *internal_retrieve(const char *key)
	{
		unsigned int lastidx = 1;		/* the last node index */
		unsigned int curidx;			/* current node index */
		const char *keyptr = key;		/* input stream at current token */
		KTrieNode *node = NULL;			/* current node being processed */

		if (!*key)
		{
			return m_empty;
		}

		/* Start traversing at the root node */
		do
		{
			/* Find where the next character is, then advance */
			curidx = m_base[lastidx].idx + charval(*keyptr);
			keyptr++;

			/* Past the end of the array: no such production */
			if (curidx > m_baseSize)
			{
				return NULL;
			}

			node = &m_base[curidx];

			/* Check if this slot is supposed to be empty or is a collision */
			if (node->mode == Node_Unused || node->parent != lastidx)
			{
				return NULL;
			}

			if (node->mode == Node_Term)
			{
				/* A terminator holds the rest of its key in the string table;
				 * the remaining input has to match it exactly.
				 */
				if (strcmp(keyptr, &m_stringtab[node->idx]) == 0)
				{
					return node;
				}
				return NULL;
			}

			lastidx = curidx;
		} while (*keyptr != '\0');

		return node;
	}

	/**
	 * @brief Doubles the size of the node array.
	 *
	 * Stored values are copy-constructed into the new array and destroyed in
	 * the old one.  Every pointer into m_base is invalidated.
	 *
	 * @return				True on success, false if allocation failed (the
	 *						trie is left unchanged in that case).
	 */
	bool grow()
	{
		/* The current # of nodes in the tree is trie->baseSize + 1 */
		unsigned int cur_size = m_baseSize;
		unsigned int new_size = cur_size * 2;

		KTrieNode *new_base = (KTrieNode *)malloc((new_size + 1) * sizeof(KTrieNode));
		if (!new_base)
		{
			return false;
		}

		memcpy(new_base, m_base, sizeof(KTrieNode) * (m_baseSize + 1));
		memset(&new_base[cur_size + 1], 0, (new_size - cur_size) * sizeof(KTrieNode));

		for (size_t i = 0; i <= m_baseSize; i++)
		{
			if (m_base[i].valset)
			{
				/* Placement construct+copy the object, then placement destroy the old. */
				new (&new_base[i].value) K(m_base[i].value);
				m_base[i].value.~K();
			}
		}

		free(m_base);
		m_base = new_base;
		m_baseSize = new_size;

		return true;
	}

	/**
	 * @brief Maps a key character to its node offset (1-255 for non-NUL characters).
	 */
	inline unsigned char charval(char c)
	{
		return (unsigned char)c;
	}

	/**
	 * @brief Zeroes the node array and string table and re-creates the root.
	 *
	 * Does not run any destructors; callers do that first.
	 */
	void internal_clear()
	{
		m_tail = 0;
		m_numElements = 0;

		memset(m_base, 0, sizeof(KTrieNode) * (m_baseSize + 1));
		memset(m_stringtab, 0, sizeof(char) * m_stSize);

		/* Sentinel root node */
		m_base[1].idx = 1;
		m_base[1].mode = Node_Arc;
		m_base[1].parent = 1;
	}

	/**
	 * @brief Destroys every value held in the node array.
	 *
	 * The valset flags are left as they are, and the empty-key entry is not
	 * touched; callers reset or free the storage afterwards.
	 */
	void run_destructors()
	{
		for (size_t i = 0; i <= m_baseSize; i++)
		{
			if (m_base[i].valset)
			{
				m_base[i].value.~K();
			}
		}
	}

	/**
	 * @brief Appends a string (with its terminator) to the string table,
	 * doubling the table until it fits.
	 *
	 * May reallocate m_stringtab, invalidating pointers into it.
	 * NOTE: the result of realloc() is not checked.
	 *
	 * @param ptr			String to append.
	 * @return				Offset of the copy inside the string table.
	 */
	unsigned int x_addstring(const char *ptr)
	{
		size_t len = strlen(ptr) + 1;

		if (m_tail + len >= m_stSize)
		{
			while (m_tail + len >= m_stSize)
			{
				m_stSize *= 2;
			}
			m_stringtab = (char *)realloc(m_stringtab,m_stSize);
		}

		unsigned int tail = m_tail;
		strcpy(&m_stringtab[tail], ptr);
		m_tail += len;

		return tail;
	}

	/**
	 * @brief Finds the lowest base index, at or after start, whose slot for
	 * character c is unused.  Grows the node array and continues searching
	 * when nothing is free.
	 *
	 * NOTE: the result of grow() is not checked here.
	 *
	 * @param c				Character that must fit.
	 * @param start			First base index to try.
	 * @return				Base index i such that m_base[i + c] is unused.
	 */
	unsigned int x_check(char c, unsigned int start=1)
	{
		unsigned char _c = charval(c);
		unsigned int to_check = m_baseSize - _c;
		for (unsigned int i=start; i<=to_check; i++)
		{
			if (m_base[i+_c].mode == Node_Unused)
			{
				return i;
			}
		}

		grow();

		return x_check(c, to_check+1);
	}

	/**
	 * @brief Same as x_check(), but the slots for both c1 and c2 must be unused.
	 *
	 * @param c1			First character that must fit.
	 * @param c2			Second character that must fit.
	 * @param start			First base index to try.
	 * @return				Base index with both slots unused.
	 */
	unsigned int x_check2(char c1, char c2, unsigned int start=1)
	{
		unsigned char _c1 = charval(c1);
		unsigned char _c2 = charval(c2);
		unsigned int to_check = m_baseSize - (_c1 > _c2 ? _c1 : _c2);
		for (unsigned int i=start; i<=to_check; i++)
		{
			if (m_base[i+_c1].mode == Node_Unused
				&& m_base[i+_c2].mode == Node_Unused)
			{
				return i;
			}
		}

		grow();

		return x_check2(c1, c2, to_check+1);
	}

	/**
	 * @brief Same as x_check(), but for an arbitrary list of offsets which
	 * must all land on unused slots.
	 *
	 * @param offsets		Offsets (character values) that must fit.
	 * @param count			Number of entries in offsets.
	 * @param start			First base index to try.
	 * @return				Base index with every requested slot unused.
	 */
	unsigned int x_check_multi(
		unsigned int offsets[],
		unsigned int count,
		unsigned int start=1)
	{
		KTrieNode *cur;
		unsigned int to_check = m_baseSize;
		unsigned int highest = 0;

		/* The largest offset bounds how far the base index may go. */
		for (unsigned int i=0; i<count; i++)
		{
			if (offsets[i] > highest)
			{
				highest = offsets[i];
			}
		}

		to_check -= highest;

		for (unsigned int i=start; i<=to_check; i++)
		{
			bool okay = true;
			for (unsigned int j=0; j<count; j++)
			{
				cur = &m_base[i+offsets[j]];
				if (cur->mode != Node_Unused)
				{
					okay = false;
					break;
				}
			}
			if (okay)
			{
				return i;
			}
		}

		grow();

		return x_check_multi(offsets, count, to_check+1);
	}
public:
	/**
	 * @brief Returns the number of bytes held by the node array and the
	 * string table.
	 */
	size_t mem_usage()
	{
		return (sizeof(KTrieNode) * (m_baseSize))
			+ m_stSize
			+ sizeof(KTrieNode);
	}

	/**
	 * @brief Returns the number of keys that currently have a value set.
	 */
	size_t size()
	{
		return m_numElements;
	}
private:
	KTrieNode *m_base;			/* Base array for the sparse tables */
	KTrieNode *m_empty;			/* Special case for empty strings */
	char *m_stringtab;			/* String table pointer */
	unsigned int m_baseSize;	/* Size of the base array, in members */
	unsigned int m_stSize;		/* Size of the string table, in bytes */
	unsigned int m_tail;		/* Current unused offset into the string table */
	size_t m_numElements;		/* Number of elements in use */
};
|
|
|
|
/**
|
|
* Double Array Trie algorithm, based on:
|
|
* An Efficient Implementation of Trie Structures, by
|
|
* Jun-ichi Aoe, Katsushi Morimoto, and Takashi Sato
|
|
* from Software - Practice and Experience, Vol. 22(9), 695-721 (September 1992)
|
|
*
|
|
* A Trie is a simple data structure which stores strings as DFAs, with each
|
|
* transition state being a string entry. For example, observe the following strings:
|
|
*
|
|
* BAILOPAN, BAT, BACON, BACK
|
|
* These transition according to the following production rules:
|
|
* B -> ... B
|
|
* A -> ... BA
|
|
* I -> ... BAI
|
|
* LOPAN BAILOPAN
|
|
* T -> ... BAT
|
|
* C -> BAC
|
|
* O -> ... BACO
|
|
* N BACON
|
|
* K BACK
|
|
*
|
|
* The standard implementation for this - using lists - gives a slow linear lookup, somewhere between
|
|
* O(N+M) or O(log n). A faster implementation is proposed in the paper above, which is based on compacting
|
|
* the transition states into two arrays. In the paper's implementation, two arrays are used, and thus it is
|
|
* called the "Double Array" algorithm. However, the CHECK array's size is maintained the same as BASE,
|
|
* so they can be combined into one structure. The array seems complex at first, but is very simple: it is a
|
|
* tree structure flattened out into a single vector. I am calling this implementation the Flat Array Trie.
|
|
*
|
|
* BASE[] is an array where each member is a node in the Trie. The node can either be UNUSED (empty), an ARC
|
|
* (containing an offset to the next set of ARCs), or a TERMINATOR (contains the rest of a string).
|
|
* Each node has an index which must be interpreted based on the node type. If the node is a TERMINATOR, then the
|
|
* index is an index into a string table, to find the rest of the string.
|
|
* If the node is an ARC, the index is another index into BASE. For each possible token that can follow the
|
|
* current token, the value of those tokens can be added to the index given in the ARC. Thus, given a current
|
|
* position and the next desired token, the current arc will jump to another arc which can contain either:
|
|
* 1) An invalid production (collision, no entry exists)
|
|
* 2) An empty production (no entry exists)
|
|
* 3) Another arc label (the string ends here or continues into more productions)
|
|
* 4) A TERMINATOR (the string ends here and contains an unused set of productions)
|
|
*
|
|
* So, given current offset N (starting at N=1), jumping to token C means the next offset will be:
|
|
* offs = BASE[n] + C
|
|
* Thus, the next node will be at:
|
|
* BASE[BASE[n] + C]
|
|
*
|
|
* This allows each ARC to specify the base offset for any of its ARC children, like a tree. Each node specifies
|
|
* its parent ARC -- so if an invalid offset is specified, the parent will not match, and thus no such derived
|
|
* string exists.
|
|
*
|
|
* This means that arrays can be laid out "sparsely," maximizing their usage. Note that N need not be related to
|
|
* the range of tokens (1-256). I.e., a base index does not have to be at 1, 256, 512, et cetera. This is because
|
|
* insertion comes with a small deal of complexity. To insert a new set of tokens T, the algorithm finds a new
|
|
* BASE index N such that BASE[N+T[i]] is unused for each T[i]. Thus, indirection is not necessarily linear;
|
|
* traversing a chain of ARC nodes can _and will_ jump around BASE.
|
|
*
|
|
* Of course, given this level of flexibility in the array organization, there are collisions. This is largely
|
|
* where insertions become slow, as the old chain must be relocated before the new one is used. Relocation means
|
|
* finding one or more new base indexes, and this means traversing BASE until an acceptable index is found, such
|
|
* that each offset is unused (see description in previous paragraph).
|
|
*
|
|
* However, it is not insertion time we are concerned about. The "trie" name comes from reTRIEval. We are only
|
|
* concerned with lookup and deletion. Both lookup and deletion are O(k), where k is relative to the length of the
|
|
* input string. Note that it is best case O(1) and worst case O(k). Deleting the entire trie is always O(1).
|
|
*/
|
|
|
|
#endif //_INCLUDE_SOURCEMOD_TEMPLATED_TRIE_H_
|