public final class StringUtils extends Object
| Modifier and Type | Field and Description |
|---|---|
static Pattern |
doubleBar
Pattern to remove double bars from disjunct regex.
|
| Modifier and Type | Method and Description |
|---|---|
static void |
computeShortestEditScript(String wordForm,
String lemma,
int[][] distance,
StringBuffer permutations)
Computes the Shortest Edit Script (SES) to convert a word into its lemma.
|
static String[] |
convertListTokenToArrayStrings(List<Token> tokenizedSentence)
Converts a list of token objects into an array of strings.
|
static String |
createDisjunctRegexFromList(List<String> words) |
static String[] |
decodeLemmas(String[] tokens,
Span[] preds)
Decodes the lemma from the word and the induced lemma class.
|
static void |
decodeLemmasToSpans(String[] tokens,
Span[] preds)
Decodes the lemma induced type into the lemma and sets it as value of the
Span type.
|
static String |
decodeShortestEditScript(String wordForm,
String permutations)
Read predicted SES by the lemmatizer model and apply the permutations to
obtain the lemma from the wordForm.
|
static List<Integer> |
exactStringFinder(String pattern,
String sentence)
Finds a pattern (typically a named entity string) in a sentence string.
|
static List<Integer> |
exactTokenFinder(String pattern,
String[] tokens)
Finds a pattern (typically a named entity string) in a tokenized sentence.
|
static List<Integer> |
exactTokenFinderIgnoreCase(String pattern,
String[] tokens)
Finds a pattern (typically a named entity string) in a tokenized sentence, ignoring case.
|
static List<File> |
getFilesInDir(File inputPath)
Recursively get every file in a directory and add them to a list.
|
static String |
getSetStringFromList(List<String> posLemmaValues) |
static String |
getShortestEditScript(String wordForm,
String lemma)
Get the SES required to go from a word to a lemma.
|
static String |
getStringFromTokens(String[] tokens)
Gets the String joined by a space of an array of tokens.
|
static int[][] |
levenshteinDistance(String wordForm,
String lemma)
Computes the Levenshtein distance of two strings in a matrix.
|
static void |
splitLine(String line,
char delimiter,
String[] splitted)
Fast line splitting with a separator, typically a tab or space character.
|
public static Pattern doubleBar
public static List<Integer> exactTokenFinderIgnoreCase(String pattern, String[] tokens)
Span indexes of the named entity found, if any.
pattern - a string to find
tokens - an array of tokens
public static List<Integer> exactTokenFinder(String pattern, String[] tokens)
Span indexes of the named entity found, if any.
pattern - a string to find
tokens - an array of tokens
public static List<Integer> exactStringFinder(String pattern, String sentence)
pattern - the pattern to be searched
sentence - the sentence
public static String getStringFromTokens(String[] tokens)
tokens - an array of tokens representing a tokenized sentence
public static List<File> getFilesInDir(File inputPath)
inputPath - the input directory
public static int[][] levenshteinDistance(String wordForm, String lemma)
wordForm - the form
lemma - the lemma
public static void computeShortestEditScript(String wordForm, String lemma, int[][] distance, StringBuffer permutations)
wordForm - the token
lemma - the target lemma
distance - the Levenshtein distance
permutations - the number of permutations
public static String decodeShortestEditScript(String wordForm, String permutations)
wordForm - the wordForm
permutations - the permutations predicted by the lemmatizer model
public static String getShortestEditScript(String wordForm, String lemma)
wordForm - the word
lemma - the lemma
public static String[] decodeLemmas(String[] tokens, Span[] preds)
tokens - the array of tokens
preds - the predicted lemma classes
public static void decodeLemmasToSpans(String[] tokens, Span[] preds)
tokens - the tokens in the sentence
preds - the predicted spans
public static String[] convertListTokenToArrayStrings(List<Token> tokenizedSentence)
tokenizedSentence - the list of token objects

Copyright © 2017 IXA pipes. All rights reserved.