package org.nasdanika.ai.cli;

import java.util.Collection;

import org.nasdanika.common.Description;

import com.github.jelmerk.hnswlib.core.DistanceFunction;
import com.github.jelmerk.hnswlib.core.Index;
import com.github.jelmerk.hnswlib.core.Item;
import com.github.jelmerk.hnswlib.core.ProgressListener;
import com.github.jelmerk.hnswlib.core.hnsw.HnswIndex;
import com.github.jelmerk.hnswlib.core.hnsw.HnswIndex.Builder;

import io.opentelemetry.api.common.Attributes;
import io.opentelemetry.api.common.AttributesBuilder;
import io.opentelemetry.api.trace.Span;
import picocli.CommandLine.Option;

/**
 * Command line options for configuring and populating an {@link HnswIndex}.
 * Subclasses supply the distance function via {@link #getDistanceFunction()}.
 *
 * @param <TVector> Vector type of the indexed items
 * @param <TDistance> Distance type produced by the distance function
 */
public abstract class HnswIndexBuilderArgGroup<TVector, TDistance extends Comparable<TDistance>> {

    private static final int AVAILABLE_PROCESSORS = Runtime.getRuntime().availableProcessors();

    /** Value passed to {@code withEf()} when the index builder is created. */
    @Option(
        names = "--hnsw-ef",
        description = {
            "Size of the dynamic list for the nearest neighbors",
            "Default value: ${DEFAULT-VALUE}"
        },
        defaultValue = "200")
    @Description(
        """
        The size of the dynamic list for the nearest neighbors (used during the search).
        Higher ``ef`` leads to more accurate but slower search.
        The value of ``ef`` can be anything between ``k`` (number of items to return from search) and the size of the dataset[^ef-javadoc].

        [^ef-javadoc]: [ef javadoc](https://javadoc.io/static/com.github.jelmerk/hnswlib-core/1.2.0/com/github/jelmerk/hnswlib/core/hnsw/HnswIndex.BuilderBase.html#withEf(int))
        """)
    protected int ef;

    /**
     * Value passed to {@code withEfConstruction()} when the index builder is created.
     * The misspelled {@code --hnsw-ef-contruction} name is retained as an alias so that
     * existing command lines keep working; new usages should prefer the correct spelling.
     */
    @Option(
        names = { "--hnsw-ef-construction", "--hnsw-ef-contruction" },
        description = {
            "Controls the index time / index precision",
            "Default value: ${DEFAULT-VALUE}"
        },
        defaultValue = "200")
    @Description(
        """
        The option has the same meaning as ``--hnsw-ef``, but controls the index time / index precision.
        Bigger ``ef-construction`` leads to longer construction, but better index quality.
        At some point, increasing ``ef-construction`` does not improve the quality of the index.
        One way to check if the selection of ``ef-construction`` was ok is to measure a recall for
        ``M`` nearest neighbor search when ``ef = ef-construction``: if the recall is lower than ``0.9``,
        then there is room for improvement[^ef-construction-javadoc].

        [^ef-construction-javadoc]: [ef-construction javadoc](https://javadoc.io/static/com.github.jelmerk/hnswlib-core/1.2.0/com/github/jelmerk/hnswlib/core/hnsw/HnswIndex.BuilderBase.html#withEfConstruction(int))
        """)
    protected int efConstruction;

    /** Value passed to {@code withM()} when the index builder is created. */
    @Option(
        names = "--hnsw-m",
        description = {
            "The number of bi-directional links created",
            "for every new element during construction",
            "Default value: ${DEFAULT-VALUE}"
        },
        defaultValue = "16")
    @Description(
        """
        Sets the number of bi-directional links created for every new element during construction.
        Reasonable range for m is ``2-100``. Higher m work better on datasets with high intrinsic dimensionality and/or high recall,
        while low m work better for datasets with low intrinsic dimensionality and/or low recalls.
        The parameter also determines the algorithm's memory consumption.
        As an example for ``d = 4`` random vectors optimal ``m`` for search is somewhere around ``6``,
        while for high dimensional datasets (word embeddings, good face descriptors),
        higher ``m`` are required (e.g. ``m = 48, 64``) for optimal performance at high recall.
        The range ``m = 12-48`` is ok for the most of the use cases.
        When ``m`` is changed one has to update the other parameters.
        Nonetheless, ``ef`` and ``efConstruction`` parameters can be roughly estimated by
        assuming that ``m * efConstruction`` is a constant[^m-javadoc].

        [^m-javadoc]: [m javadoc](https://javadoc.io/static/com.github.jelmerk/hnswlib-core/1.2.0/com/github/jelmerk/hnswlib/core/hnsw/HnswIndex.BuilderBase.html#withM(int))
        """)
    protected int m;

    /** When set, {@code withRemoveEnabled()} is called on the index builder. */
    @Option(
        names = "--hnsw-remove-enabled",
        description = "If true, removal from the index is enabled"
    )
    protected boolean removeEnabled;

    /** Number of threads passed to {@code Index.addAll()} in {@link #buildAndAddAll}. */
    @Option(
        names = "--hnsw-threads",
        description = {
            "Number of threads to use for parallel indexing",
            "Default to the number of available processors"
        })
    private int threads = AVAILABLE_PROCESSORS;

    /** Progress reporting interval passed to {@code Index.addAll()} in {@link #buildAndAddAll}. */
    @Option(
        names = "--hnsw-progress-update-interval",
        description = {
            "After indexing this many items progress will be",
            "reported. The last element will always be",
            "reported regardless of this setting. ",
            "Default value: " + Index.DEFAULT_PROGRESS_UPDATE_INTERVAL
        })
    private int progressUpdateInterval = Index.DEFAULT_PROGRESS_UPDATE_INTERVAL;

    /**
     * Creates an index builder configured with the {@code m}, {@code ef}, {@code efConstruction}
     * and remove-enabled options and the distance function from {@link #getDistanceFunction()}.
     *
     * @param dimensions Passed to {@code HnswIndex.newBuilder()} as the dimensions argument
     * @param maxItemCount Passed to {@code HnswIndex.newBuilder()} as the maximum item count
     * @return Configured builder; the index itself is not built
     */
    public HnswIndex.Builder<TVector, TDistance> createIndexBuilder(int dimensions, int maxItemCount) {
        Builder<TVector, TDistance> builder = HnswIndex.newBuilder(dimensions, getDistanceFunction(), maxItemCount)
            .withM(m)
            .withEf(ef)
            .withEfConstruction(efConstruction);

        if (removeEnabled) {
            builder.withRemoveEnabled();
        }

        return builder;
    }

    /**
     * @return Distance function to use when creating the index builder
     */
    protected abstract DistanceFunction<TVector, TDistance> getDistanceFunction();

    /**
     * Records the index-shaping options ({@code ef}, {@code efConstruction}, {@code m},
     * remove-enabled) as attributes of the given span.
     *
     * @param span Span to set attributes on
     */
    public void setSpanAttributes(Span span) {
        span.setAttribute("hnsw.ef", ef);
        span.setAttribute("hnsw.ef-construction", efConstruction);
        span.setAttribute("hnsw.m", m);
        span.setAttribute("hnsw.remove-enabled", removeEnabled);
    }

    /**
     * Builds an index sized to {@code items.size()} and adds all items to it using the configured
     * number of threads. Progress is reported as {@code hnsw.progress} events on the span with
     * {@code done}, {@code total} and {@code percent} attributes.
     *
     * @param <TId> Item identifier type
     * @param <TItem> Item type
     * @param dimensions Passed to {@link #createIndexBuilder(int, int)}
     * @param items Items to index; the collection size is used as the maximum item count
     * @param span Span receiving progress events
     * @return Index containing the items
     * @throws InterruptedException Propagated from {@code Index.addAll()}
     */
    public <TId, TItem extends Item<TId, TVector>> HnswIndex<TId, TVector, TItem, TDistance> buildAndAddAll(
            int dimensions,
            Collection<TItem> items,
            Span span) throws InterruptedException {
        HnswIndex<TId, TVector, TItem, TDistance> index = createIndexBuilder(dimensions, items.size()).build();
        ProgressListener progressListener = (workDone, max) -> {
            AttributesBuilder ab = Attributes.builder();
            ab.put("done", workDone);
            ab.put("total", max);
            // Defensive: avoid division by zero should the listener ever be invoked with a zero total.
            // The multiplication is done in long to avoid int overflow for large item counts.
            ab.put("percent", max > 0 ? (100L * workDone) / max : 100L);
            span.addEvent("hnsw.progress", ab.build());
        };
        index.addAll(items, threads, progressListener, progressUpdateInterval);
        return index;
    }

}