public class TokenizerList extends Tokenizer implements java.io.Serializable
positions, tokenVector
Constructor and Description |
---|
TokenizerList()
Constructor that initializes an empty list of tokenizers
|
TokenizerList(java.util.List<java.lang.String> tokenizerNames)
Constructor that takes a list of the tokenizer names
|
Modifier and Type | Method and Description |
---|---|
void |
addTokenizer(java.lang.String tokenizerName)
Adds a tokenizer to the end of this list of tokenizers.
|
void |
addTokenizerNames(java.util.List<java.lang.String> tokenizerNames)
Adds tokenizers to the end of this list of tokenizers
|
void |
clear()
Clears this list of tokenizers
|
void |
disableMangler()
Disables the mangler in the FileManglerTokenizer object in this list
|
void |
enableMangler(java.lang.String settings)
Enables the mangler with the given settings in the FileManglerTokenizer object in this list
|
void |
enableMangler(java.lang.String settings,
java.util.List<java.lang.String> tokens)
Enables the mangler with the given settings in the FileManglerTokenizer object in this list
|
java.util.Iterator<java.lang.String> |
getIterator()
Provides the string iterator over this document's tokens from the last tokenizer from this list
|
java.util.List<java.lang.String> |
getNames()
Provides the ordered list of the tokenizer names in this list
|
java.util.List<Tokenizer> |
getTokenizers()
Provides the ordered list of the tokenizers in this list
|
void |
printTokenizerNames()
Prints the ordered list of the tokenizer names in this list
|
void |
setManglerRNG(java.util.Random random)
Sets the random number generator to use with a FileManglerTokenizer object in this list
|
protected Pair<java.util.Iterator<java.lang.String>,java.util.Iterator<Pair<java.lang.Integer,java.lang.Integer>>> |
split_tokenize(java.util.Iterator<java.lang.String> tokens_iterator,
java.util.Iterator<Pair<java.lang.Integer,java.lang.Integer>> positions_iterator)
Applies each tokenizer from this list, in order, on the tokens provided by a Splitter object; no tokenizers that
are able to read from a file should be present in this list
|
java.util.Iterator<java.lang.String> |
tokenize()
Applies each tokenizer from this list, in order, on the tokens; the first tokenizer must be able to read from a
file and create the list of tokens
|
java.util.Iterator<java.lang.String> |
tokenizeFile(java.io.File file)
Applies each tokenizer from this list, in order, on the file; the first tokenizer must be able to read from a file
and the tokenizers must already be instantiated
|
java.util.Iterator<java.lang.String> |
tokenizeFile(java.lang.String filename)
Applies each tokenizer from this list, in order, on the file; the first tokenizer must be able to read from a file
and the tokenizers must already be instantiated
|
getPositionsVector, getTokenVector, iterator, position_iterator, printTokens, tokenize, tokenize, tokenize
public TokenizerList()
public TokenizerList(java.util.List<java.lang.String> tokenizerNames)
tokenizerNames
- the string names of the tokenizers to use; the first tokenizer must be able to read from a file
public void enableMangler(java.lang.String settings)
settings
- the string containing the mangler settings
public void enableMangler(java.lang.String settings, java.util.List<java.lang.String> tokens)
settings
- the string containing the mangler settings
tokens
- the string list containing the tokens to put through this mangler
public void disableMangler()
public void setManglerRNG(java.util.Random random)
random
- the random number generator to use if there is a FileManglerTokenizer object in this list
public void addTokenizer(java.lang.String tokenizerName)
tokenizerName
- the string name of the tokenizer to be added to the end of this list
public void addTokenizerNames(java.util.List<java.lang.String> tokenizerNames)
tokenizerNames
- the list of tokenizer names to use; the first tokenizer must be able to read from a file
public void clear()
public java.util.List<java.lang.String> getNames()
public java.util.List<Tokenizer> getTokenizers()
public void printTokenizerNames()
public java.util.Iterator<java.lang.String> tokenizeFile(java.lang.String filename)
filename
- the string name of the file to tokenize
public java.util.Iterator<java.lang.String> tokenizeFile(java.io.File file)
file
- the file to tokenize
public java.util.Iterator<java.lang.String> tokenize()
protected Pair<java.util.Iterator<java.lang.String>,java.util.Iterator<Pair<java.lang.Integer,java.lang.Integer>>> split_tokenize(java.util.Iterator<java.lang.String> tokens_iterator, java.util.Iterator<Pair<java.lang.Integer,java.lang.Integer>> positions_iterator)
tokens_iterator
- the string iterator over the token elements
positions_iterator
- the integer pair iterator over the start and end position elements
public java.util.Iterator<java.lang.String> getIterator()