// OuSob - File: /wwwroot/clipx/usr/include/htdig/WordList.h
//
// WordList.h
//
// NAME
//
// manage and use an inverted index file.
//
// SYNOPSIS
//
// #include
//
// Configuration* config;
// WordReference wordRef;
// ...
// WordList* words = new WordList(config)
//
// delete words;
//
// DESCRIPTION
//
// WordList is the mifluz equivalent of a database handler. Each
// WordList object is bound to an inverted index file and implements the
// operations to create it, fill it with word occurrences and search
// for an entry matching a given criterion.
//
// CONFIGURATION
//
// wordlist_extend {true|false} (default false)
// If true maintain reference count of unique
// words. The Noccurrence method gives access to this count.
//
// wordlist_verbose (default 0)
// Set the verbosity level of the WordList class.
//
// 1 walk logic
//
// 2 walk logic details
//
// 3 walk logic lots of details
//
// wordlist_page_size (default 8192)
// Berkeley DB page size (see Berkeley DB documentation)
//
// wordlist_cache_size (default 500K)
// Berkeley DB cache size (see Berkeley DB documentation)
// Cache makes a huge difference in performance. It must be at least 2%
// of the expected total data size. Note that if compression is activated
// the data size is eight times larger than the actual file size. In this
// case the cache must be scaled to 2% of the data size, not 2%
// of the file size. See Cache tuning in the mifluz guide for
// more hints.
//
// wordlist_compress {true|false} (default false)
// Activate compression of the index. The resulting index is eight times
// smaller than the uncompressed index.
//
//
// END
//
// Part of the ht://Dig package
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
//
//
// $Id: WordList.h,v 1.10 2004/05/28 13:15:28 lha Exp $
//
#ifndef _WordList_h_
#define _WordList_h_
#include
#include
#ifndef SWIG
#include "Dictionary.h"
#include "List.h"
#include "htString.h"
#include "WordRecord.h"
#include "WordReference.h"
#include "WordType.h"
#include "WordDB.h"
#include "WordDBCompress.h"
#include "Configuration.h"
#include "WordCursor.h"
#endif /* SWIG */
// Forward declarations of class names used by the declarations in this header.
class List;
class WordList;
class WordDBCursor;
//
// WordList: interface to an inverted index file.
//
// Methods wrapped in "#ifndef SWIG" are hidden from the SWIG interface
// generator. All data members are public and are declared at the end of
// the class, inside the last non-SWIG section.
//
class WordList
{
public:
    //-
    // Constructor. Builds the inverted index handling object from the
    // run time configuration parameters listed in the CONFIGURATION
    // section.
    //
    WordList(const Configuration& config_arg);
    virtual ~WordList();

    //-
    // Add wordRef to the index. Inserting the same wordRef
    // twice is an error, which implies a lookup in the index before
    // the insertion takes place.
    // Returns OK on success, NOTOK on error.
    //
    int Insert(const WordReference& wordRef)
    {
        return Put(wordRef, DB_NOOVERWRITE);
    }

    //-
    // Add wordRef to the index, replacing any entry already stored
    // under the same Key().
    // Returns OK on success, NOTOK on error.
    //
    int Override(const WordReference& wordRef)
    {
        return Put(wordRef, 0);
    }

#ifndef SWIG
    //
    // Common backend of Insert (flags is DB_NOOVERWRITE) and
    // Override (flags is 0). Its result is returned unchanged by both.
    //
    int Put(const WordReference& wordRef, int flags);
#endif /* SWIG */

    //-
    // Returns OK if wordRef is present in the index, NOTOK otherwise.
    //
    int Exists(const WordReference& wordRef)
    {
        // db.Exists() reports a hit with 0; anything else means "absent".
        if (db.Exists(wordRef) != 0)
            return NOTOK;
        return OK;
    }

#ifndef SWIG
    //-
    // Returns OK if word is present in the index, NOTOK otherwise.
    // The lookup is done with a WordReference built from word.
    //
    int Exists(const String& word)
    {
        return Exists(WordReference(word));
    }
#endif /* SWIG */

    //
    // Permanent deletion
    //

    //-
    // Remove every index entry whose key matches the Key() part of
    // wordRef, walking the index to find them.
    // Returns the number of entries successfully deleted.
    //
    int WalkDelete(const WordReference& wordRef);

    //-
    // Remove the single index entry that exactly matches the Key()
    // part of wordRef.
    // Returns OK if the deletion is successful, NOTOK otherwise.
    //
    int Delete(const WordReference& wordRef)
    {
        // A failed removal is reported right away, without calling Unref.
        if (db.Del(wordRef) != 0)
            return NOTOK;
        // The entry is gone: the result of Unref becomes the result
        // of the whole operation.
        return Unref(wordRef);
    }

#ifdef SWIG
%name(DeleteCursor)
#endif /* SWIG */
    //-
    // Remove the inverted index entry the cursor currently points to.
    // Returns 0 on success, a Berkeley DB error code on error.
    // Mostly of use inside a callback function written for a WordCursor.
    //
    int Delete(WordDBCursor& cursor)
    {
        return cursor.Del();
    }

    //-
    // Open the inverted index stored in filename. mode is either
    // O_RDONLY or O_RDWR; O_RDWR may be or'ed with O_TRUNC to discard
    // the content of an existing inverted index.
    // When word_only is true, two entries compare equal as soon as the
    // "word" part of their keys is equal, even if the numeric fields
    // differ. (The numeric fields are not described in this header.)
    // Returns OK on success, NOTOK otherwise.
    //
    int Open(const String& filename, int mode, int word_only=false);

    //-
    // Close the inverted index.
    //
    int Close();

    //
    // The methods below return a list of all the WordReference *
    // matching a constraint.
    //

    //-
    // Returns the word occurrences that exactly match the Key() part
    // of wordRef, as a List of pointers to WordReference objects.
    // Freeing the list is up to the caller. See the List.h header
    // for usage.
    //
    List *Find(const WordReference& wordRef)
    {
        return operator[](wordRef);
    }

    //-
    // Returns the word occurrences that exactly match word, as a
    // List of pointers to WordReference objects. Freeing the list is
    // up to the caller. See the List.h header for usage.
    //
    List *FindWord(const String& word)
    {
        return operator[](word);
    }

#ifndef SWIG
    //-
    // Operator form of the Find method.
    //
    List *operator [] (const WordReference& wordRef);

    //-
    // Operator form of the FindWord method: forwards to the
    // WordReference variant with a WordReference built from word.
    //
    List *operator [] (const String& word)
    {
        return operator[](WordReference(word));
    }
#endif /* SWIG */

    //-
    // Returns the word occurrences matching the Key() part of prefix.
    // The string stored in the Key() (the one GetWord() gives access
    // to) matches every string that starts with it. The result is a
    // List of pointers to WordReference objects; freeing it is up to
    // the caller.
    //
    List *Prefix (const WordReference& prefix);

#ifndef SWIG
    //-
    // Same as above, for a plain string: returns the occurrences of
    // every word that starts with prefix. The result is a List of
    // pointers to WordReference objects; freeing it is up to the
    // caller.
    //
    List *Prefix (const String& prefix)
    {
        return Prefix(WordReference(prefix));
    }
#endif /* SWIG */

    //
    // Iteration over the complete database.
    //

#ifndef SWIG
    //-
    // Returns every unique word stored in the inverted index, as a
    // List of pointers to String objects. Freeing the list is up to
    // the caller. See the List.h header for usage.
    //
    List *Words();
#endif /* SWIG */

    //-
    // Returns every entry stored in the inverted index, as a List of
    // pointers to WordReference objects. Freeing the list is up to
    // the caller. See the List.h header for usage.
    //
    List *WordRefs();

#ifndef SWIG
    //-
    // Create a cursor that goes through all the occurrences in the
    // inverted index and calls callback with callback_data for
    // every match.
    // The cursor is allocated with new and no pointer to it is kept
    // here: releasing it is left to the caller.
    //
    WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data)
    {
        return new WordCursor(this, callback, callback_data);
    }
#endif /* SWIG */

    //-
    // Create a cursor that goes through the occurrences in the
    // inverted index that match searchKey. With action set to
    // HTDIG_WORDLIST_WALKER (the default) a callback is run, with its
    // callback data, for every match. With action set to
    // HTDIG_WORDLIST_COLLECT every match is pushed, as a WordReference
    // object, onto a collectRes list that the caller must free.
    // NOTE(review): the original text names these searchKey.callback,
    // searchKey.callback_data and searchKey.collectRes; they are
    // presumably members of the returned WordCursor -- confirm in
    // WordCursor.h.
    // The cursor is allocated with new and no pointer to it is kept
    // here: releasing it is left to the caller.
    //
    WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER)
    {
        return new WordCursor(this, searchKey, action);
    }

#ifndef SWIG
    //-
    // Create a cursor that goes through the occurrences in the
    // inverted index that match searchKey and calls callback with
    // callback_data for every match.
    // The cursor is allocated with new and no pointer to it is kept
    // here: releasing it is left to the caller.
    //
    WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data)
    {
        return new WordCursor(this, searchKey, callback, callback_data);
    }
#endif /* SWIG */

    //
    // Update/get global word statistics
    //

    //-
    // Increment the reference count of the string held in the
    // Key().GetWord() part of wordRef.
    // Returns OK on success, NOTOK otherwise.
    //
    int Ref(const WordReference& wordRef);

    //-
    // Decrement the reference count of the string held in the
    // Key().GetWord() part of wordRef.
    // Returns OK on success, NOTOK otherwise.
    //
    int Unref(const WordReference& wordRef);

#ifndef SWIG
    //-
    // Store in noccurrence the number of occurrences of the string
    // held in the GetWord() part of key.
    // Returns OK on success, NOTOK otherwise.
    //
    int Noccurrence(const WordKey& key, unsigned int& noccurrence) const;

    //
    // Accessors
    //

    //
    // Read-only access to the WordType object of this WordList
    // (the wtype data member).
    //
    const WordType& GetWordType() const
    {
        return wtype;
    }
#endif /* SWIG */

    //-
    // Returns the Configuration object that was used to initialize
    // this WordList object.
    //
    const Configuration& GetConfiguration() const
    {
        return config;
    }

#ifndef SWIG
    //
    // Input/Output
    //

    //-
    // Write an ASCII description of the index on the stream f, one
    // WordReference ASCII description per line.
    // Returns 0 on success, not 0 otherwise.
    //
    int Write(FILE* f);

    //-
    // Read WordReference ASCII descriptions from the stream f.
    // Invalid descriptions and empty lines are ignored.
    // Returns the number of WordReference inserted, or < 0 if an
    // error occurs.
    //
    int Read(FILE* f);
#endif /* SWIG */

    //
    // Fetch WordReferences from the database.
    // This is the backend of WordRefs, operator[], Prefix...
    //
    List *Collect(const WordReference& word);

#ifndef SWIG
    //
    // Compressor object accessors. SetCompressor only stores the
    // pointer it is given.
    //
    WordDBCompress *GetCompressor()
    {
        return compressor;
    }

    void SetCompressor(WordDBCompress* compressor_arg)
    {
        compressor = compressor_arg;
    }

    //
    // Data members. The declaration order is kept as is because it
    // is also the order in which the members are initialized.
    //

    // Returned by GetWordType().
    const WordType wtype;
    // Returned by GetConfiguration(). Held by reference, not copied.
    const Configuration& config;

    // State flags; they are not modified anywhere in this header.
    int isopen;
    int isread;

    //
    // If true, enable the extended functionalities of WordList such
    // as per-word statistics. Read from the wordlist_extended
    // configuration parameter.
    // NOTE(review): the CONFIGURATION section at the top of this file
    // spells the parameter wordlist_extend -- confirm which name the
    // implementation actually reads.
    //
    int extended;

    // Database handle used by Exists and Delete.
    WordDB db;
    // Pointer managed through GetCompressor() / SetCompressor().
    WordDBCompress *compressor;
    // Verbosity level (see wordlist_verbose in the CONFIGURATION section).
    int verbose;
#endif /* SWIG */
};
#endif /* _WordList_h_ */