Enabled flash attention on nvidia GPUs by default and adjusted defaults to reduce memory requirements
curvedinf committed Aug 6, 2024
1 parent 92975d7 commit d41e39f
Showing 4 changed files with 10 additions and 4 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -30,7 +30,8 @@ In this section are recipes to run `dir-assistant` in basic capacity to get you
 ### Quickstart with Local Default Model (Phi 3 128k)

 To get started locally, you can download a default llm model. Default configuration with this model requires
-14GB of memory, but you will be able to adjust the configuration to fit lower memory requirements. To run via CPU:
+11GB of memory on most hardware or 8GB on nvidia GPUs due to flash attention availability (enabled by default).
+You will be able to adjust the configuration to fit higher or lower memory requirements. To run via CPU:

 ```shell
 pip install dir-assistant
@@ -221,7 +222,7 @@ The most important `llama-cpp-python` options are related to tuning the LLM to y
 file text that can be included when running a prompt.
 * `CONTEXT_FILE_RATIO` sets the proportion of prompt history to file text to be included when sent to the LLM.
 Higher ratios mean more file text and less prompt history. More file text generally improves comprehension.
-* If your llm `n_ctx` is smaller than your embed `n_ctx` times `CONTEXT_FILE_RATIO`, your file text chunks
+* If your llm `n_ctx` times `CONTEXT_FILE_RATIO` is smaller than your embed `n_ctx`, your file text chunks
 have the potential to be larger than your llm context, and thus will not be included. To ensure all files
 can be included, make sure your embed context is smaller than `n_ctx` times `CONTEXT_FILE_RATIO`.
 * Larger embed `n_ctx` will chunk your files into larger sizes, which allows LLMs to understand them more
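The corrected bullet above states a sizing rule that is easy to check numerically. Below is a minimal sketch of that check; the values are illustrative assumptions (9200 is this commit's new llm `n_ctx` default, while 8192 for the embed `n_ctx` and 0.5 for `CONTEXT_FILE_RATIO` are hypothetical), not the project's actual configuration.

```python
# Sketch of the sizing rule from the README bullet above. All values are
# illustrative: 9200 is this commit's new llm n_ctx default; 8192 for the
# embed n_ctx and 0.5 for CONTEXT_FILE_RATIO are assumptions, not the
# project's actual defaults.
llm_n_ctx = 9200           # LLAMA_CPP_OPTIONS['n_ctx'] after this commit
embed_n_ctx = 8192         # assumed LLAMA_CPP_EMBED_OPTIONS['n_ctx']
context_file_ratio = 0.5   # hypothetical CONTEXT_FILE_RATIO

# File text receives this share of the llm context window.
file_text_budget = llm_n_ctx * context_file_ratio

# A chunk can be up to embed_n_ctx tokens long, so it must fit in that budget.
if embed_n_ctx > file_text_budget:
    print(f"Chunks up to {embed_n_ctx} tokens can exceed the "
          f"{file_text_budget:.0f}-token file budget and would be dropped.")
else:
    print("Every chunk fits within the llm context's file-text share.")
```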
2 changes: 1 addition & 1 deletion dir_assistant/config.py
@@ -26,7 +26,7 @@
     'EMBED_MODEL': '',
     'LLM_MODEL': '',
     'LLAMA_CPP_OPTIONS': {
-        'n_ctx': 12000,
+        'n_ctx': 9200,
         'verbose': False,
     },
     'LLAMA_CPP_EMBED_OPTIONS': {
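For context on the lowered default: `LLAMA_CPP_OPTIONS` holds keyword arguments for `llama-cpp-python`. A minimal sketch of how the new value might be consumed, assuming the dictionary is passed straight into the `Llama` constructor (the model path is a placeholder, not a real default):

```python
from llama_cpp import Llama

# Assumed to mirror the defaults above after this commit.
LLAMA_CPP_OPTIONS = {
    'n_ctx': 9200,       # lowered from 12000 to reduce memory requirements
    'verbose': False,
}

# Hypothetical model path, for illustration only.
llm = Llama(model_path='/path/to/model.gguf', **LLAMA_CPP_OPTIONS)
```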
2 changes: 1 addition & 1 deletion dir_assistant/index.py
@@ -153,7 +153,7 @@ def find_split_point(embed, line_content, max_size, header):

 def search_index(embed, index, query, all_chunks):
     query_embedding = embed.create_embedding([query])['data'][0]['embedding']
-    distances, indices = index.search(np.array([query_embedding]), 100)
+    distances, indices = index.search(np.array([query_embedding]), 100) # 819,200 tokens max with default embedding
     relevant_chunks = [all_chunks[i] for i in indices[0] if i != -1]
     return relevant_chunks

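The figure in the new comment is the product of the search width and the maximum chunk size: `index.search` returns up to 100 chunks, and each chunk can be as long as one embed context window. A quick check of that arithmetic, assuming the default embed `n_ctx` is 8192 (it is not shown in this diff):

```python
# Arithmetic behind the "819,200 tokens max" comment: index.search returns up
# to 100 chunks, and each chunk is at most one embed context window long.
# 8192 is an assumption about the default embed n_ctx; it is not in this diff.
top_k = 100
assumed_embed_n_ctx = 8192
print(top_k * assumed_embed_n_ctx)  # 819200
```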
5 changes: 5 additions & 0 deletions dir_assistant/platform_setup.py
@@ -5,11 +5,13 @@

 def platform(args, config_dict):
     is_cpu = False
+    is_cuda = False
     cmake_args = ''
     if args.selection.lower() == 'cpu':
         is_cpu = True
     elif args.selection.lower() == 'cuda':
         cmake_args = '-DGGML_CUDA=on'
+        is_cuda = True
     elif args.selection.lower() == 'rocm':
         cmake_args = '-DGGML_HIPBLAS=ON'
     elif args.selection.lower() == 'metal':
@@ -32,6 +34,9 @@ def platform(args, config_dict):
     else:
         config_dict['LLAMA_CPP_OPTIONS']['n_gpu_layers'] = -1
         config_dict['LLAMA_CPP_EMBED_OPTIONS']['n_gpu_layers'] = -1
+    if is_cuda:
+        config_dict['LLAMA_CPP_OPTIONS']['flash_attn'] = True
+        config_dict['LLAMA_CPP_EMBED_OPTIONS']['flash_attn'] = True

     save_config({'DIR_ASSISTANT': config_dict})

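The new `flash_attn` keys are ordinary `llama-cpp-python` constructor options, so on a CUDA setup both the LLM and the embedding model should now be created with flash attention enabled. A minimal sketch of the effect on the config dictionary, using stand-in objects for the real CLI arguments and defaults:

```python
from types import SimpleNamespace

# Stand-ins for the real CLI args and config defaults; values are assumptions.
args = SimpleNamespace(selection='cuda')
config_dict = {
    'LLAMA_CPP_OPTIONS': {'n_ctx': 9200, 'verbose': False},
    'LLAMA_CPP_EMBED_OPTIONS': {},
}

# Mirrors the branch added in this commit: selecting cuda now also enables
# flash attention for both the LLM and the embedding model.
if args.selection.lower() == 'cuda':
    for key in ('LLAMA_CPP_OPTIONS', 'LLAMA_CPP_EMBED_OPTIONS'):
        config_dict[key]['flash_attn'] = True

print(config_dict['LLAMA_CPP_OPTIONS'])
# {'n_ctx': 9200, 'verbose': False, 'flash_attn': True}
```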