Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

IndriIndex Class Reference

#include <IndriIndex.hpp>

List of all members.

Public Methods

 IndriIndex (size_t memorySize=INDRI_DEFAULT_MEMORY_SIZE, float queryProportion=INDRI_DEFAULT_QUERY_PROPORTION)
 ~IndriIndex ()
void setName (const std::string &prefix)
 sets the name for this index

DOCID_T addDocument (struct ParsedDocument *document)
DOCID_T addDocument (const char *documentName, const greedy_vector< char * > &words, const greedy_vector< TagExtent > &tagExtents)
DocInfoList * docInfoList (TERMID_T termID) const
 doc entries in a term index,
See also:
DocList


DocPositionInfoList * docPositionInfoList (TERMID_T termID)
 doc entries in a term index with positions

indri::index::DocListFrequencyIterator * docFrequencyInfoList (TERMID_T termID)
 doc entries in a term index without positions

TermInfoList * termInfoList (DOCID_T docID) const
 word entries in a document index (bag of words),
See also:
TermList


TermInfoList * termInfoListSeq (DOCID_T docID) const
 word entries in a document index (sequence of words),
See also:
TermList


indri::index::TermListBuilder * termPositionList (DOCID_T docID)
 internal IndriIndex term list representation

indri::index::FieldListIterator * fieldPositionListIterator (int fieldID)
 field list

Open index
bool open (const std::string &indexName)
 Open previously created Index with given prefix.

bool open (const char *indexName)
 Open previously created Index with given prefix.

bool openRead (const std::string &indexName)
 Open previously created Index with given prefix in read only mode.

bool create (const std::string &indexName)
 Create a new index with the given prefix.

bool create (const std::string &indexName, const std::vector< FieldDescription > &fields)
 Create a new index with the given prefix and tag set.

void close ()
 Close the index.

Spelling and index conversion
TERMID_T term (const TERM_T &word) const
 Convert a term spelling to a termID.

const TERM_T term (TERMID_T termID) const
 Convert a termID to its spelling.

DOCID_T document (const EXDOCID_T &docIDStr) const
 Convert a spelling to docID.

const EXDOCID_T document (DOCID_T docID) const
 Convert a docID to its spelling.

const char * field (int fieldID)
 Convert a fieldID to its name.

int field (const char *fieldName)
 Convert a field name to its fieldID.

int field (const std::string &fieldName)
 Convert a field name to its fieldID.

Summary counts
COUNT_T docCount () const
 Total count (i.e., number) of documents in collection.

COUNT_T termCountUnique () const
 Total count of unique terms in collection.

INT64 termCount (TERMID_T termID) const
 Total counts of a term in collection.

INT64 termCount () const
 Total counts of all terms in collection.

INT64 fieldTermCount (int fieldID, TERMID_T termID) const
 Total counts of a term in a field.

INT64 fieldTermCount (int fieldID) const
 Total counts of all terms in a field.

INT64 fieldDocCount (int fieldID) const
 Total count of documents that contain a given field.

INT64 fieldDocCount (int fieldID, TERMID_T termID) const
 Total count of documents that contain a given term in a given field.

double docLengthAvg () const
 Average document length.

COUNT_T docCount (TERMID_T termID) const
 Total count of documents that contain a given term.

COUNT_T docIndexedLength (DOCID_T documentID) const
 return indexed length of the document

COUNT_T docLength (DOCID_T documentID) const
 return length of the document

int termMaxDocumentFrequency (TERMID_T termID)
 Maximum number of times this term occurs in any single document.

int termMinDocumentLength (TERMID_T termID)
 Minimum length of any document containing this term.

double termMaxDocumentFraction (TERMID_T termID)
 Maximum over documents of (termCount/documentLength).

int maxDocumentLength ()
 Maximum length of any document in the corpus.


Protected Methods

void _writeCache ()
void _writeAndMerge ()
void _writeBatchSegment ()
void _mergeBatch ()
void _mergeBatchSegments (int start, int end, int newNumber, bool finalMerge)
void _mergeBatchTermLists (const std::vector< int > &segmentMapping)
void _writeIncrementalSegment ()
void _mergeIncrementalSegments ()
void _readTermMapping (greedy_vector< int > &mapping, int segment, int secondSegment)
void _openMergeFiles (int startSegment, int endSegment, std::vector< File * > &listFiles, std::vector< File * > &statsFiles, std::vector< File * > &mappingFiles, std::vector< WriteBuffer * > &mappingBuffers, std::vector< ReadBuffer * > &statsBuffers, std::vector< indri::index::DocListFileIterator * > &listIterators, std::vector< char * > &terms, std::vector< indri::index::TermData * > &termDatas, bool finalMerge)
void _openDBs ()
void _openReadOnlyDBs ()
void _openSegments ()
void _createDBs ()
void _createFields (const std::vector< FieldDescription > &fieldNames)
void _closeFields ()
indri::index::DocumentData fetchDocumentData (int key) const
int fetchDocumentLength (int key) const
void _updateTermlist (TERMID_T termID, int position)
int _updateTermData (int documentLength)
size_t _cacheSize ()
void _computeMemoryBounds (size_t memorySize, float queryProportion)
void _resetEstimatePoint ()
indri::index::TermData * _createTermData ()
indri::index::TermData * _fetchTermData (TERMID_T termID)
indri::index::TermData * _lookupTermData (TERMID_T termID)
void _cleanCache ()
void _deleteTermData (indri::index::TermData *termData)
size_t _sizeTermData ()
void _clearTermData ()
void _clearTermCache ()
void _storeTermCache (const char *term, TERMID_T termID, indri::index::TermData *&termData)
void _flushTermStatistics (TERMID_T termID, const indri::index::TermFieldStatistics &statistics)
void _addTermDataToBuilder (indri::index::DocListDiskBuilder &builder, indri::index::DocListFileIterator &iterator, int writingID, int readingID)
void _addOpenTags (greedy_vector< indri::index::FieldExtent > &indexedTags, greedy_vector< indri::index::FieldExtent > &openTags, const greedy_vector< TagExtent > &extents, unsigned int &extentIndex, unsigned int position)
void _removeClosedTags (greedy_vector< indri::index::FieldExtent > &tags, unsigned int position)
void _lookupTerm (const char *term, TERMID_T &termID, indri::index::TermData *&termData)
void _finishDocument (greedy_vector< indri::index::TermFieldStatistics * > &seenStatistics)
void _writeDocumentTermList (File::offset_type &offset, int &byteLength, DOCID_T documentID, int documentLength, indri::index::TermListBuilder &locatedTerms)
void _writeDocumentStatistics (File::offset_type offset, int byteLength, int indexedLength, int totalLength, int uniqueTerms)
void _handleCache ()
int _lookupTag (const char *tag)
bool _readTermData (TERMID_T &termID, char *termBuffer, indri::index::TermData *termData, ReadBuffer *termDataFile)
void _incrementalWriteTermData (TERMID_T termID, indri::index::TermData *termData)
void _batchWriteTermData (TERMID_T termID, indri::index::TermData *termData, WriteBuffer *file)
int _compressTermData (char *buffer, int size, indri::index::TermData *termData)
void _decompressTermData (const char *buffer, int size, indri::index::TermData *termData)
void _writeParameters (const std::string &fileName)
bool _readParameters (const std::string &fileName)
void _openDocumentFiles ()
std::string _buildFileName (const char *suffix)
std::string _buildFileName (const char *suffix, int index)

Protected Attributes

bool _readOnly
indri::index::CorpusStatistics _corpusStatistics
std::vector< indri::index::FieldData * > _fieldData
std::map< const char *, int, less_string > _fieldLookup
std::string _baseName
bool _writingDocTermLists
 the prefix name

Keyfile _termDataStore
KeyfileWordMap _documentMap
KeyfileWordMap _termMap
File * _documentStatisticsFile
File _documentLengthFile
std::vector< File * > _segments
int _batchSegmentCount
File * _documentTermLocationsFile
indri::index::TermListBuilder _termList
Buffer _termListBuffer
greedy_vector< indri::index::TermData * > _seenTerms
HashTable< int, indri::index::TermData * > * _termDataTable
HashTable< const char *, term_cache_entry * > * _cache
 in memory storage of data relating to terms -- partial inverted lists and statistics

ReadBuffer * _documentStatisticsBuffer
ReadBuffer * _documentLengthBuffer
size_t _listsSize
size_t _memorySize
size_t _termDataSize
size_t _termCacheSize
size_t _statisticsBufferSize
size_t _lengthBufferSize
float _queryProportion
bool _batchBuild
INT64 _estimatePoint
INT64 _lastCacheFlush
 number of terms in the index when we should next check on flushing the inverted lists


Detailed Description

Indri internal index class.


Constructor & Destructor Documentation

IndriIndex::IndriIndex size_t    memorySize = INDRI_DEFAULT_MEMORY_SIZE,
float    queryProportion = INDRI_DEFAULT_QUERY_PROPORTION
 

Create.

Parameters:
memorySize  how much memory to use
queryProportion  proportion of load assumed to be queries

IndriIndex::~IndriIndex  
 


Member Function Documentation

void IndriIndex::_addOpenTags greedy_vector< indri::index::FieldExtent > &    indexedTags,
greedy_vector< indri::index::FieldExtent > &    openTags,
const greedy_vector< TagExtent > &    extents,
unsigned int &    extentIndex,
unsigned int    position
[protected]
 

void IndriIndex::_addTermDataToBuilder indri::index::DocListDiskBuilder &    builder,
indri::index::DocListFileIterator &    iterator,
int    writingID,
int    readingID
[protected]
 

void IndriIndex::_batchWriteTermData TERMID_T    termID,
indri::index::TermData *    termData,
WriteBuffer *    file
[protected]
 

std::string IndriIndex::_buildFileName const char *    suffix,
int    index
[protected]
 

std::string IndriIndex::_buildFileName const char *    suffix [protected]
 

size_t IndriIndex::_cacheSize   [protected]
 

void IndriIndex::_cleanCache   [protected]
 

void IndriIndex::_clearTermCache   [protected]
 

void IndriIndex::_clearTermData   [protected]
 

void IndriIndex::_closeFields   [protected]
 

int IndriIndex::_compressTermData char *    buffer,
int    size,
indri::index::TermData *    termData
[protected]
 

void IndriIndex::_computeMemoryBounds size_t    memorySize,
float    queryProportion
[protected]
 

void IndriIndex::_createDBs   [protected]
 

void IndriIndex::_createFields const std::vector< FieldDescription > &    fieldNames [protected]
 

indri::index::TermData * IndriIndex::_createTermData   [protected]
 

void IndriIndex::_decompressTermData const char *    buffer,
int    size,
indri::index::TermData *    termData
[protected]
 

void IndriIndex::_deleteTermData indri::index::TermData *    termData [protected]
 

indri::index::TermData * IndriIndex::_fetchTermData TERMID_T    termID [protected]
 

void IndriIndex::_finishDocument greedy_vector< indri::index::TermFieldStatistics * > &    seenStatistics [protected]
 

void IndriIndex::_flushTermStatistics TERMID_T    termID,
const indri::index::TermFieldStatistics &    statistics
[protected]
 

void IndriIndex::_handleCache   [protected]
 

void IndriIndex::_incrementalWriteTermData TERMID_T    termID,
indri::index::TermData *    termData
[protected]
 

int IndriIndex::_lookupTag const char *    tag [protected]
 

void IndriIndex::_lookupTerm const char *    term,
TERMID_T &    termID,
indri::index::TermData *&    termData
[protected]
 

indri::index::TermData * IndriIndex::_lookupTermData TERMID_T    termID [protected]
 

void IndriIndex::_mergeBatch   [protected]
 

void IndriIndex::_mergeBatchSegments int    start,
int    end,
int    newNumber,
bool    finalMerge
[protected]
 

void IndriIndex::_mergeBatchTermLists const std::vector< int > &    segmentMapping [protected]
 

void IndriIndex::_mergeIncrementalSegments   [protected]
 

void IndriIndex::_openDBs   [protected]
 

void IndriIndex::_openDocumentFiles   [protected]
 

void IndriIndex::_openMergeFiles int    startSegment,
int    endSegment,
std::vector< File * > &    listFiles,
std::vector< File * > &    statsFiles,
std::vector< File * > &    mappingFiles,
std::vector< WriteBuffer * > &    mappingBuffers,
std::vector< ReadBuffer * > &    statsBuffers,
std::vector< indri::index::DocListFileIterator * > &    listIterators,
std::vector< char * > &    terms,
std::vector< indri::index::TermData * > &    termDatas,
bool    finalMerge
[protected]
 

void IndriIndex::_openReadOnlyDBs   [protected]
 

void IndriIndex::_openSegments   [protected]
 

bool IndriIndex::_readParameters const std::string &    fileName [protected]
 

bool IndriIndex::_readTermData TERMID_T &    termID,
char *    termBuffer,
indri::index::TermData *    termData,
ReadBuffer *    termDataFile
[protected]
 

void IndriIndex::_readTermMapping greedy_vector< int > &    mapping,
int    segment,
int    secondSegment
[protected]
 

void IndriIndex::_removeClosedTags greedy_vector< indri::index::FieldExtent > &    tags,
unsigned int    position
[protected]
 

void IndriIndex::_resetEstimatePoint   [protected]
 

size_t IndriIndex::_sizeTermData   [protected]
 

void IndriIndex::_storeTermCache const char *    term,
TERMID_T    termID,
indri::index::TermData *&    termData
[protected]
 

int IndriIndex::_updateTermData int    documentLength [protected]
 

void IndriIndex::_updateTermlist TERMID_T    termID,
int    position
[protected]
 

void IndriIndex::_writeAndMerge   [protected]
 

void IndriIndex::_writeBatchSegment   [protected]
 

void IndriIndex::_writeCache   [protected]
 

void IndriIndex::_writeDocumentStatistics File::offset_type    offset,
int    byteLength,
int    indexedLength,
int    totalLength,
int    uniqueTerms
[protected]
 

void IndriIndex::_writeDocumentTermList File::offset_type &    offset,
int &    byteLength,
DOCID_T    documentID,
int    documentLength,
indri::index::TermListBuilder &    locatedTerms
[protected]
 

void IndriIndex::_writeIncrementalSegment   [protected]
 

void IndriIndex::_writeParameters const std::string &    fileName [protected]
 

DOCID_T IndriIndex::addDocument const char *    documentName,
const greedy_vector< char * > &    words,
const greedy_vector< TagExtent > &    tagExtents
 

add a parsed document to the index.

Parameters:
documentName  the name of the document to add
words  greedy vector of the terms in the document
tagExtents  greedy vector of the tag extents for the document.
Returns:
the internal document id of the document

DOCID_T IndriIndex::addDocument struct ParsedDocument *    document
 

add a parsed document to the index.

Parameters:
document  the document to add
Returns:
the internal document id of the document

void IndriIndex::close  
 

Close the index.

bool IndriIndex::create const std::string &    indexName,
const std::vector< FieldDescription > &    fields
 

Create a new index with the given prefix and tag set.

bool IndriIndex::create const std::string &    indexName
 

Create a new index with the given prefix.

COUNT_T IndriIndex::docCount TERMID_T    termID const
 

Total count of documents that contain a given term.

COUNT_T IndriIndex::docCount   const [inline]
 

Total count (i.e., number) of documents in collection.

indri::index::DocListFrequencyIterator * IndriIndex::docFrequencyInfoList TERMID_T    termID
 

doc entries in a term index without positions

COUNT_T IndriIndex::docIndexedLength DOCID_T    documentID const
 

return indexed length of the document

DocInfoList * IndriIndex::docInfoList TERMID_T    termID const
 

doc entries in a term index,

See also:
DocList

COUNT_T IndriIndex::docLength DOCID_T    documentID const
 

return length of the document

double IndriIndex::docLengthAvg  
 

Average document length.

DocPositionInfoList * IndriIndex::docPositionInfoList TERMID_T    termID
 

doc entries in a term index with positions

const EXDOCID_T IndriIndex::document DOCID_T    docID const
 

Convert a docID to its spelling.

DOCID_T IndriIndex::document const EXDOCID_T &    docIDStr const
 

Convert a spelling to docID.

indri::index::DocumentData IndriIndex::fetchDocumentData int    key const [protected]
 

int IndriIndex::fetchDocumentLength int    key const [protected]
 

int IndriIndex::field const std::string &    fieldName
 

Convert a field name to its fieldID.

int IndriIndex::field const char *    fieldName
 

Convert a field name to its fieldID.

const char * IndriIndex::field int    fieldID
 

Convert a fieldID to its name.

INT64 IndriIndex::fieldDocCount int    fieldID,
TERMID_T    termID
const
 

Total count of documents that contain a given term in a given field.

INT64 IndriIndex::fieldDocCount int    fieldID const
 

Total count of documents that contain a given field.

indri::index::FieldListIterator * IndriIndex::fieldPositionListIterator int    fieldID
 

field list

INT64 IndriIndex::fieldTermCount int    fieldID const
 

Total counts of all terms in a field.

INT64 IndriIndex::fieldTermCount int    fieldID,
TERMID_T    termID
const
 

Total counts of a term in a field.

int IndriIndex::maxDocumentLength  
 

Maximum length of any document in the corpus.

bool IndriIndex::open const char *    indexName
 

Open previously created Index with given prefix.

bool IndriIndex::open const std::string &    indexName
 

Open previously created Index with given prefix.

bool IndriIndex::openRead const std::string &    indexName
 

Open previously created Index with given prefix in read only mode.

void IndriIndex::setName const std::string &    prefix
 

sets the name for this index

const TERM_T IndriIndex::term TERMID_T    termID const
 

Convert a termID to its spelling.

TERMID_T IndriIndex::term const TERM_T &    word const
 

Convert a term spelling to a termID.

INT64 IndriIndex::termCount   const [inline]
 

Total counts of all terms in collection.

INT64 IndriIndex::termCount TERMID_T    termID const
 

Total counts of a term in collection.

COUNT_T IndriIndex::termCountUnique   const [inline]
 

Total count of unique terms in collection.

TermInfoList * IndriIndex::termInfoList DOCID_T    docID const
 

word entries in a document index (bag of words),

See also:
TermList

TermInfoList * IndriIndex::termInfoListSeq DOCID_T    docID const
 

word entries in a document index (sequence of words),

See also:
TermList

double IndriIndex::termMaxDocumentFraction TERMID_T    termID
 

Maximum over documents of (termCount/documentLength).

int IndriIndex::termMaxDocumentFrequency TERMID_T    termID
 

Maximum number of times this term occurs in any single document.

int IndriIndex::termMinDocumentLength TERMID_T    termID
 

Minimum length of any document containing this term.

indri::index::TermListBuilder * IndriIndex::termPositionList DOCID_T    docID
 

internal IndriIndex term list representation


Member Data Documentation

std::string IndriIndex::_baseName [protected]
 

bool IndriIndex::_batchBuild [protected]
 

int IndriIndex::_batchSegmentCount [protected]
 

HashTable<const char*, term_cache_entry*>* IndriIndex::_cache [protected]
 

in memory storage of data relating to terms -- partial inverted lists and statistics

indri::index::CorpusStatistics IndriIndex::_corpusStatistics [protected]
 

ReadBuffer* IndriIndex::_documentLengthBuffer [protected]
 

File IndriIndex::_documentLengthFile [protected]
 

KeyfileWordMap IndriIndex::_documentMap [protected]
 

ReadBuffer* IndriIndex::_documentStatisticsBuffer [protected]
 

File* IndriIndex::_documentStatisticsFile [protected]
 

File* IndriIndex::_documentTermLocationsFile [protected]
 

INT64 IndriIndex::_estimatePoint [protected]
 

std::vector<indri::index::FieldData*> IndriIndex::_fieldData [protected]
 

std::map<const char*, int, less_string> IndriIndex::_fieldLookup [protected]
 

INT64 IndriIndex::_lastCacheFlush [protected]
 

number of terms in the index when we should next check on flushing the inverted lists

size_t IndriIndex::_lengthBufferSize [protected]
 

size_t IndriIndex::_listsSize [protected]
 

size_t IndriIndex::_memorySize [protected]
 

float IndriIndex::_queryProportion [protected]
 

bool IndriIndex::_readOnly [protected]
 

greedy_vector<indri::index::TermData*> IndriIndex::_seenTerms [protected]
 

std::vector<File*> IndriIndex::_segments [protected]
 

size_t IndriIndex::_statisticsBufferSize [protected]
 

size_t IndriIndex::_termCacheSize [protected]
 

size_t IndriIndex::_termDataSize [protected]
 

Keyfile IndriIndex::_termDataStore [protected]
 

HashTable<int, indri::index::TermData*>* IndriIndex::_termDataTable [protected]
 

indri::index::TermListBuilder IndriIndex::_termList [protected]
 

Buffer IndriIndex::_termListBuffer [protected]
 

KeyfileWordMap IndriIndex::_termMap [protected]
 

bool IndriIndex::_writingDocTermLists [protected]
 

the prefix name


The documentation for this class was generated from the following files:
Generated on Wed Nov 3 12:59:39 2004 for Lemur Toolkit by doxygen 1.2.18