public class Tokenizer extends java.lang.Object implements java.io.Serializable, java.lang.Iterable<java.util.Map.Entry<java.lang.String,java.util.List<Token>>>
Modifier and Type | Field and Description |
---|---|
protected java.util.Map<java.lang.String,java.util.List<Token>> |
tokenVectorMap
The map from each splitter name to its corresponding list of token elements
|
Constructor and Description |
---|
Tokenizer() |
Modifier and Type | Method and Description |
---|---|
java.util.Map<java.lang.String,java.util.List<Token>> |
getTokenVectorMap()
Provides the list of each token in order of its appearance
|
java.util.Iterator<java.util.Map.Entry<java.lang.String,java.util.List<Token>>> |
iterator() |
void |
printTokens()
Prints each token to the system output stream using the UTF-8 charset
|
java.util.List<Token> |
tokenize(java.util.List<Token> tokenVector)
Alters or eliminates certain tokens.
|
java.util.List<Token> |
tokenize(java.lang.String filename)
Splits the document into tokens.
|
java.lang.String |
toString() |
protected java.util.Map<java.lang.String,java.util.List<Token>> tokenVectorMap
public java.util.List<Token> tokenize(java.util.List<Token> tokenVector)
tokenVector
- the vector of token elements to tokenize
public java.util.List<Token> tokenize(java.lang.String filename)
filename
- the filename of the document to split into tokens
public java.util.Map<java.lang.String,java.util.List<Token>> getTokenVectorMap()
public void printTokens()
public java.util.Iterator<java.util.Map.Entry<java.lang.String,java.util.List<Token>>> iterator()
iterator
in interface java.lang.Iterable<java.util.Map.Entry<java.lang.String,java.util.List<Token>>>
public java.lang.String toString()
toString
in class java.lang.Object