001package org.nasdanika.ai.cli;
002
003import java.util.Collection;
004
005import org.nasdanika.common.Description;
006
007import com.github.jelmerk.hnswlib.core.DistanceFunction;
008import com.github.jelmerk.hnswlib.core.Index;
009import com.github.jelmerk.hnswlib.core.Item;
010import com.github.jelmerk.hnswlib.core.ProgressListener;
011import com.github.jelmerk.hnswlib.core.hnsw.HnswIndex;
012import com.github.jelmerk.hnswlib.core.hnsw.HnswIndex.Builder;
013
014import io.opentelemetry.api.common.Attributes;
015import io.opentelemetry.api.common.AttributesBuilder;
016import io.opentelemetry.api.trace.Span;
017import picocli.CommandLine.Option;
018
019public abstract class HnswIndexBuilderArgGroup<TVector, TDistance extends Comparable<TDistance>> {
020                
021        private static final int AVAILABLE_PROCESSORS = Runtime.getRuntime().availableProcessors();
022
023        @Option( 
024                        names = "--hnsw-ef",
025                        description = {
026                                        "Size of the dynamic list for the nearest neighbors",
027                                        "Default value: ${DEFAULT-VALUE}"
028                        },
029                        defaultValue = "200")   
030        @Description(
031                        """
032                        The size of the dynamic list for the nearest neighbors (used during the search). 
033                        Higher ``ef`` leads to more accurate but slower search. 
034                        The value ef of can be anything between ``k`` (number of items to return from search) and the size of the dataset.
035                        
036                        [^ef-javadoc]: [ef javadoc](https://javadoc.io/static/com.github.jelmerk/hnswlib-core/1.2.0/com/github/jelmerk/hnswlib/core/hnsw/HnswIndex.BuilderBase.html#withEf(int))                                                                        
037                        """)
038        protected int ef;       
039        
040        @Option( 
041                        names = "--hnsw-ef-contruction",
042                        description = {
043                                        "Controls the index time / index precision",
044                                        "Default value: ${DEFAULT-VALUE}"
045                        },
046                        defaultValue = "200")   
047        @Description(
048                        """
049                        The option has the same meaning as ``--hnsw-ef``, but controls the index time / index precision. 
050                        Bigger ``ef-construction`` leads to longer construction, but better index quality. 
051                        At some point, increasing ``ef-construction`` does not improve the quality of the index. 
052                        One way to check if the selection of ``ef-construction`` was ok is to measure a recall for 
053                        ``M`` nearest neighbor search when ``ef = ef-construction``: if the recall is lower than ``0.9``,
054                        then there is room for improvement.
055                        
056                        [^ef-construction-javadoc]: [ef-construction javadoc](https://javadoc.io/static/com.github.jelmerk/hnswlib-core/1.2.0/com/github/jelmerk/hnswlib/core/hnsw/HnswIndex.BuilderBase.html#withEfConstruction(int))                                                                  
057                        """)
058        protected int efConstruction;   
059        
060        @Option( 
061                        names = "--hnsw-m",
062                        description = {
063                                        "The number of bi-directional links created",
064                                        "for every new element during construction",
065                                        "Default value: ${DEFAULT-VALUE}"
066                        },
067                        defaultValue = "16")    
068        @Description(
069                        """
070                        Sets the number of bi-directional links created for every new element during construction.
071                        Reasonable range for m is ``2-100``. Higher m work better on datasets with high intrinsic dimensionality and/or high recall,
072                        while low m work better for datasets with low intrinsic dimensionality and/or low recalls.
073                        The parameter also determines the algorithm's memory consumption.
074                        As an example for ``d = 4`` random vectors optimal ``m`` for search is somewhere around ``6``,
075                        while for high dimensional datasets (word embeddings, good face descriptors), 
076                        higher ``m`` are required (e.g. ``m = 48, 64``) for optimal     performance at high recall. 
077                        The range ``m = 12-48`` is ok for the most of the use cases. 
078                        When ``m`` is changed one has to update the other parameters. 
079                        Nonetheless, ``ef`` and ``efConstruction`` parameters can be roughly estimated by
080                        assuming that ``m``  ``efConstruction`` is a constant[^m-javadoc].
081                        
082                        [^m-javadoc]: [m javadoc](https://javadoc.io/static/com.github.jelmerk/hnswlib-core/1.2.0/com/github/jelmerk/hnswlib/core/hnsw/HnswIndex.BuilderBase.html#withM(int)) 
083                        """)
084        protected int m;        
085        
086        @Option( 
087                names = "--hnsw-remove-enabled",
088                description = "If true, removal from the index is enabled"
089                        )       
090        protected boolean removeEnabled;
091
092        
093        @Option( 
094                names = "--hnsw-threads",
095                description = {
096                                "Number of threads to use for parallel indexing",
097                                "Default to the number of available processors"
098                })      
099        private int threads = AVAILABLE_PROCESSORS;
100
101        @Option( 
102                        names = "--hnsw-progress-update-interval",
103                        description = {
104                                        "After indexing this many items progress will be",
105                                        "reported. The last element will always be",
106                                        "reported regardless of this setting. ",
107                                        "Default value: " + Index.DEFAULT_PROGRESS_UPDATE_INTERVAL
108                        })              
109        private int progressUpdateInterval = Index.DEFAULT_PROGRESS_UPDATE_INTERVAL;    
110        
111        public HnswIndex.Builder<TVector, TDistance> createIndexBuilder(int dimensions, int maxItemCount) {
112                Builder<TVector, TDistance> builder = HnswIndex.newBuilder(dimensions, getDistanceFunction(), maxItemCount)
113                .withM(m)
114                .withEf(ef)
115                .withEfConstruction(efConstruction);
116                
117                if (removeEnabled) {
118                        builder.withRemoveEnabled();
119                }
120                
121                return builder;
122        }
123
124        protected abstract DistanceFunction<TVector, TDistance> getDistanceFunction();
125        
126        public void setSpanAttributes(Span span) {
127                span.setAttribute("hnsw.ef", ef);
128                span.setAttribute("hnsw.ef-construction", efConstruction);
129                span.setAttribute("hnsw.m", m);
130                span.setAttribute("hnsw.remove-enabled", removeEnabled);                
131        }
132                
133        public <TId, TItem extends Item<TId, TVector>> HnswIndex<TId, TVector, TItem, TDistance> buildAndAddAll(
134                        int dimensions, 
135                        Collection<TItem> items,
136                        Span span) throws InterruptedException {
137                HnswIndex<TId, TVector, TItem, TDistance> index = createIndexBuilder(dimensions, items.size()).build();
138                ProgressListener progressListener = (workDone, max) -> {
139                        AttributesBuilder ab = Attributes.builder();
140                        ab.put("done", workDone);
141                        ab.put("total", max);
142                        ab.put("percent", (100L * workDone) / max);                     
143                        span.addEvent("hnsw.progress", ab.build());
144                };
145                index.addAll(items, threads, progressListener, progressUpdateInterval);
146                return index;           
147        }
148        
149}