Class LuceneCandidateRetrieval

    • Field Detail

      • CONJUNCTION

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator CONJUNCTION
      • DISJUNCTION

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator DISJUNCTION
      • DISJUNCTION_MINUS_1

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator DISJUNCTION_MINUS_1
      • DISJUNCTION_MINUS_2

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator DISJUNCTION_MINUS_2
      • NGRAM_2_3

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator NGRAM_2_3
      • GENE_RECORDS_CNF

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator GENE_RECORDS_CNF
        Conjunctive normal form query: every token must be found, but each token may match in any field.
      • GENE_RECORDS_CNF_WITH_SYNONYMS

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator GENE_RECORDS_CNF_WITH_SYNONYMS
        Just like GENE_RECORDS_CNF but with an additional BooleanClause.Occur.SHOULD clause only for synonym matches created via GeneRecordSynonymsQueryGenerator.
      • GENE_RECORDS_FLAT_DISJUNCTION

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator GENE_RECORDS_FLAT_DISJUNCTION
        Puts all tokens on all fields in one large disjunction. Thus, not every token needs to match. Used for context scoring of gene names.
      • GENE_RECORDS_DISMAX

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator GENE_RECORDS_DISMAX
      • GENE_RECORDS_SYNONYMS_APPROX

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator GENE_RECORDS_SYNONYMS_APPROX
      • GENE_RECORDS_SYNONYMS_EXACT

        public static final de.julielab.geneexpbase.candidateretrieval.QueryGenerator GENE_RECORDS_SYNONYMS_EXACT
      • TOKEN_JAROWINKLER_SCORER

        public static final int TOKEN_JAROWINKLER_SCORER
        See Also:
        Constant Field Values
      • candidateLog

        public static final org.slf4j.Logger candidateLog
      • LUCENE_MAX_HITS

        public static final int LUCENE_MAX_HITS
        The maximum number of hits Lucene returns for a query.
        See Also:
        Constant Field Values
      • UNIT_TEST_GENE_ID_ACCUMULATION_SET

        public static Set<String> UNIT_TEST_GENE_ID_ACCUMULATION_SET
    • Constructor Detail

      • LuceneCandidateRetrieval

        @Deprecated
        public LuceneCandidateRetrieval​(org.apache.lucene.search.IndexSearcher mentionIndexSearcher,
                                        de.julielab.geneexpbase.scoring.Scorer scorer)
        Deprecated.
      • LuceneCandidateRetrieval

        @Inject
        public LuceneCandidateRetrieval​(Configuration config,
                                        ExecutorService executorService,
                                        de.julielab.geneexpbase.services.CacheService cacheService)
                                 throws de.julielab.geneexpbase.candidateretrieval.GeneCandidateRetrievalException
        Throws:
        de.julielab.geneexpbase.candidateretrieval.GeneCandidateRetrievalException
    • Method Detail

      • getTotalCacheGettime

        public static AtomicLong getTotalCacheGettime()
      • getTotalGeneRecordFieldLoadingTime

        public static AtomicLong getTotalGeneRecordFieldLoadingTime()
      • getTotalCachePuttime

        public static AtomicLong getTotalCachePuttime()
      • getTotalLuceneQueryTime

        public static AtomicLong getTotalLuceneQueryTime()
      • getCacheMisses

        public static AtomicLong getCacheMisses()
      • getCacheHits

        public static AtomicLong getCacheHits()
      • getTFIDFOnGeneSynonyms

        public de.julielab.geneexpbase.scoring.TFIDFScorer getTFIDFOnGeneSynonyms()
      • getNormalizer

        public de.julielab.geneexpbase.TermNormalizer getNormalizer()
      • setNormalizer

        public void setNormalizer​(de.julielab.geneexpbase.TermNormalizer normalizer)
      • getScorer

        public de.julielab.geneexpbase.scoring.Scorer getScorer()
      • setScorerType

        public de.julielab.geneexpbase.scoring.Scorer setScorerType​(int type)
                                                             throws de.julielab.geneexpbase.candidateretrieval.GeneCandidateRetrievalException
        Throws:
        de.julielab.geneexpbase.candidateretrieval.GeneCandidateRetrievalException
      • getScorerInfo

        public String getScorerInfo()
      • getScorerType

        public int getScorerType()
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(String originalSearchTerm,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface CandidateRetrieval
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(de.julielab.geneexpbase.genemodel.GeneMention geneMention,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface de.julielab.geneexpbase.candidateretrieval.CandidateRetrieval
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(de.julielab.geneexpbase.genemodel.GeneMention geneMention,
                                                                                     Collection<String> organisms,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface de.julielab.geneexpbase.candidateretrieval.CandidateRetrieval
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(de.julielab.geneexpbase.genemodel.GeneMention geneMention,
                                                                                     Collection<String> geneIdsFilter,
                                                                                     Collection<String> organisms,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface de.julielab.geneexpbase.candidateretrieval.CandidateRetrieval
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(de.julielab.geneexpbase.genemodel.GeneMention geneMention,
                                                                                     Collection<String> geneIdsFilter,
                                                                                     Collection<String> organisms,
                                                                                     boolean loadFields,
                                                                                     de.julielab.geneexpbase.configuration.Parameters parameters,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface CandidateRetrieval
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(String geneMentionText,
                                                                                     Collection<String> geneIdsFilter,
                                                                                     Collection<String> organism,
                                                                                     boolean loadFields,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(de.julielab.geneexpbase.genemodel.GeneMention geneMention,
                                                                                     String organism,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface CandidateRetrieval
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(String geneMentionText,
                                                                                     String organism,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface CandidateRetrieval
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(String geneMentionText,
                                                                                     Collection<String> organism,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface CandidateRetrieval
      • mapGeneIdToTaxId

        public String mapGeneIdToTaxId​(String geneId)
        Specified by:
        mapGeneIdToTaxId in interface de.julielab.geneexpbase.candidateretrieval.CandidateRetrieval
      • getIndexRecords

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getIndexRecords​(Collection<String> ids)
        Returns GeneRecordHit instances with all fields loaded.
        Parameters:
        ids - IDs of the gene records to return.
        Returns:
        The records for the given IDs.
      • getIndexRecords

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getIndexRecords​(Collection<String> ids,
                                                                                       org.apache.lucene.search.IndexSearcher indexSearcher)
      • getIndexRecords

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getIndexRecords​(Collection<String> ids,
                                                                                       de.julielab.geneexpbase.genemodel.GeneName geneName,
                                                                                       Function<de.julielab.geneexpbase.genemodel.GeneName,​String> geneNameFunc,
                                                                                       org.apache.lucene.search.IndexSearcher indexSearcher)
        Parameters:
        ids - The gene IDs of the index items to retrieve.
        geneName - The gene name to add as the mapped mention and to use to find the synonym matching the gene name best.
        geneNameFunc - The function to be applied to geneName in order to retrieve a string for comparison.
        indexSearcher - The index searcher to use.
        Returns:
        The found SynHits matching the input IDs.
      • scoreSynonymsRecordIndex

        public org.apache.commons.lang3.tuple.Pair<Map<String,​Double>,​Map<String,​Set<String>>> scoreSynonymsRecordIndex​(String queryType,
                                                                                                                                          Map<String,​Collection<de.julielab.geneexpbase.genemodel.GeneName>> ids2entities,
                                                                                                                                          Function<GeneRecordHit,​String[]> synhit2namesFunc,
                                                                                                                                          de.julielab.geneexpbase.candidateretrieval.QueryGenerator qg)

        Scores each synonym in allSynonym against the IDs in ids.

        Each resulting SynHit adds its mention score to the ID represented by this SynHit.

        Specified by:
        scoreSynonymsRecordIndex in interface CandidateRetrieval
        Parameters:
        queryType -
        ids2entities -
        qg - The query generator to use.
        Returns:
      • getCandidates

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getCandidates​(de.julielab.geneexpbase.genemodel.GeneMention gm,
                                                                                     Collection<String> taxId,
                                                                                     de.julielab.geneexpbase.configuration.Parameters parameters,
                                                                                     de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Specified by:
        getCandidates in interface CandidateRetrieval
      • getFamilyNames

        public List<de.julielab.geneexpbase.candidateretrieval.SynHit> getFamilyNames​(de.julielab.geneexpbase.genemodel.GeneMention gm,
                                                                                      de.julielab.geneexpbase.candidateretrieval.QueryGenerator queryGenerator)
        Description copied from interface: CandidateRetrieval
        Searches the index for the given gene mention filtered for family names.
        Specified by:
        getFamilyNames in interface CandidateRetrieval
        Parameters:
        gm - The gene mention to check for family names.
        queryGenerator - The query generator to use.
        Returns:
        The SynHits found in the index for the given gene mention, filtered for family names.
      • setFulltextFieldsToRecordHits

        public void setFulltextFieldsToRecordHits​(Collection<? extends de.julielab.geneexpbase.candidateretrieval.SynHit> recordHits,
                                                  Collection<String> fieldsToLoad)

        Sets the full text / gene context fields (generif, summary, interactions) on instances of GeneRecordHit.

        Note that this method accepts plain SynHit instances for convenience, but the actual objects must be GeneRecordHits.

        Specified by:
        setFulltextFieldsToRecordHits in interface CandidateRetrieval
        Parameters:
        recordHits - The GeneRecordHits to set the full text / gene context values for.
        fieldsToLoad - The gene context fields to load and set. Must be included in fullTextFieldSetter.