Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into cline
Browse files Browse the repository at this point in the history
  • Loading branch information
lukehinds committed Jan 3, 2025
2 parents 647f012 + 8b95d7f commit ebd5b80
Show file tree
Hide file tree
Showing 18 changed files with 248 additions and 177 deletions.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: "Bug Report"
description: "Report a bug to help us improve the proxy system."
title: "[Bug]: Provide a general summary of the issue"
title: "-- Provide a general summary of the issue --"
labels: [bug]
assignees: "-"
body:
Expand Down
10 changes: 4 additions & 6 deletions .github/workflows/image-publish.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
name: Publish Docker Image
on:
schedule:
# Once weekly on Fridays at noon
- cron: "00 12 * * 5"
release:
types:
- published
workflow_dispatch:

jobs:
Expand Down Expand Up @@ -59,16 +59,14 @@ jobs:
github_token: ${{ github.token }}
workflow: ".github/workflows/import_packages.yml"
workflow_conclusion: success
name: backup_weaviate
name: sqlite_data
name_is_regexp: true
skip_unpack: false
if_no_artifact_found: ignore
- name: Fetch latest FE commit SHA
id: fetch_commit_fe_sha
run: |
echo "LATEST_RELEASE=$(curl -s "https://api.github.com/repos/stacklok/codegate-ui/releases/latest" -H "Authorization: Bearer ${{ secrets.GH_CI_TOKEN }}" | grep '"zipball_url":' | cut -d '"' -f 4)" >> $GITHUB_ENV
- name: Rename to accommodate to image
run: mv ./backup_weaviate ./weaviate_backup
- name: Download git lfs dependencies
run: |
git lfs install
Expand Down
36 changes: 17 additions & 19 deletions .github/workflows/import_packages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,33 +51,31 @@ jobs:
github_token: ${{ github.token }}
workflow: ".github/workflows/import_packages.yml"
workflow_conclusion: success
name: backup_weaviate
name: sqlite_data
path: /tmp/
name_is_regexp: true
skip_unpack: false
if_no_artifact_found: ignore
if_no_artifact_found: ignore

- name: Create folder if artifact download is not enabled
if: ${{ github.event.inputs.enable_artifact_download == 'false' }}
- name: Install Poetry
run: |
mkdir -p /tmp/backup_weaviate
echo "Folder ./backup_weaviate created because artifact download is disabled."
curl -sSL https://install.python-poetry.org | python3 -
- name: Add Poetry to PATH
run: |
echo "PATH=$HOME/.poetry/bin:$PATH" >> $GITHUB_ENV
- name: Install dependencies with Poetry
run: |
poetry install
- name: Run sync
- name: 'Run import_packages.py with poetry'
run: |
export PYTHONPATH=$PYTHONPATH:./
export BACKUP_FILESYSTEM_PATH=/tmp/backup_weaviate/
export BACKUP_FOLDER=backup
# Conditionally export the variables only if artifact download is enabled
if [ "${{ github.event.inputs.enable_artifact_download }}" == "true" ]; then
python scripts/import_packages.py --jsonl-dir /tmp/jsonl-files/
else
python scripts/import_packages.py --restore-backup False --jsonl-dir /tmp/jsonl-files/
fi
poetry run python scripts/import_packages.py --jsonl-dir /tmp/jsonl-files --vec-db-path /tmp/sqlite_data/vectordb.db
- name: 'Upload Backup Files'
- name: 'Upload SQLite Vector DB File'
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4
with:
name: backup_weaviate
path: /tmp/backup_weaviate/backup*
name: sqlite_data
path: /tmp/sqlite_data/vectordb.db
retention-days: 90
5 changes: 0 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# Create a non-root user
RUN useradd -m -u 1000 -r codegate

# Copy backup if needed
RUN mkdir -p /tmp/weaviate_backup
# will not fail if the file does not exist
COPY weaviate_backu[p] /tmp/weaviate_backup
RUN chown -R codegate /tmp/weaviate_backup

# Set permissions for user codegate to run nginx
RUN chown -R codegate /var/lib/nginx && \
Expand Down
226 changes: 123 additions & 103 deletions poetry.lock

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,23 @@ PyYAML = ">=6.0.1"
fastapi = ">=0.115.5"
uvicorn = ">=0.32.1"
structlog = ">=24.4.0"
litellm = "^1.55.11"
litellm = "^1.56.8"
llama_cpp_python = ">=0.3.2"
cryptography = "^44.0.0"
sqlalchemy = "^2.0.28"
greenlet = "^3.0.3"
aiosqlite = "^0.20.0"
ollama = ">=0.4.4"
pydantic-settings = "^2.7.0"
sqlite-vec = ">=0.1.0"
pydantic-settings = "^2.7.1"
numpy = ">=1.24.0"
tree-sitter = ">=0.23.2"
tree-sitter-go = ">=0.23.4"
tree-sitter-java = ">=0.23.5"
tree-sitter-javascript = ">=0.23.1"
tree-sitter-python = ">=0.23.6"
tree-sitter-rust = ">=0.23.2"

sqlite-vec-sl-tmp = "^0.0.4"
[tool.poetry.group.dev.dependencies]
pytest = ">=7.4.0"
pytest-cov = ">=4.1.0"
Expand All @@ -37,7 +38,7 @@ bandit = ">=1.7.10"
build = ">=1.0.0"
wheel = ">=0.40.0"
litellm = ">=1.52.11"
pytest-asyncio = "0.25.0"
pytest-asyncio = "0.25.1"
llama_cpp_python = ">=0.3.2"
scikit-learn = ">=1.6.0"
python-dotenv = ">=1.0.1"
Expand Down
4 changes: 2 additions & 2 deletions scripts/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/bin/bash

# those are hardcoded on the image, will not change
BACKUP_PATH="/tmp/weaviate_backup"
BACKUP_NAME="backup"
MODEL_BASE_PATH="/app/codegate_volume/models"
CODEGATE_DB_FILE="/app/codegate_volume/db/codegate.db"
CODEGATE_VEC_DB_FILE="/app/sqlite_data/vectordb.db"
CODEGATE_CERTS="/app/codegate_volume/certs"

# Function to restore backup if paths are provided
Expand Down Expand Up @@ -37,7 +37,7 @@ start_application() {
# first restore the models
mkdir -p /app/codegate_volume/models
cp /app/default_models/* /app/codegate_volume/models
CMD_ARGS="--port 8989 --host 0.0.0.0 --model-base-path $MODEL_BASE_PATH --db-path $CODEGATE_DB_FILE"
CMD_ARGS="--port 8989 --host 0.0.0.0 --model-base-path $MODEL_BASE_PATH --db-path $CODEGATE_DB_FILE --vec-db-path $CODEGATE_VEC_DB_FILE"

# Check and append additional URLs if they are set
[ -n "$CODEGATE_OPENAI_URL" ] && CMD_ARGS+=" --openai-url $CODEGATE_OPENAI_URL"
Expand Down
16 changes: 8 additions & 8 deletions scripts/import_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@
import sqlite3

import numpy as np
import sqlite_vec
import sqlite_vec_sl_tmp

from codegate.inference.inference_engine import LlamaCppInferenceEngine
from codegate.utils.utils import generate_vector_string


class PackageImporter:
def __init__(self, jsonl_dir="data", db_path="./sqlite_data/vectordb.db"):
os.makedirs(os.path.dirname(db_path), exist_ok=True)
self.db_path = db_path
def __init__(self, jsonl_dir="data", vec_db_path="./sqlite_data/vectordb.db"):
os.makedirs(os.path.dirname(vec_db_path), exist_ok=True)
self.vec_db_path = vec_db_path
self.json_files = [
os.path.join(jsonl_dir, "archived.jsonl"),
os.path.join(jsonl_dir, "deprecated.jsonl"),
Expand All @@ -25,9 +25,9 @@ def __init__(self, jsonl_dir="data", db_path="./sqlite_data/vectordb.db"):
self.model_path = "./codegate_volume/models/all-minilm-L6-v2-q5_k_m.gguf"

def _get_connection(self):
    """Open a SQLite connection to the vector DB with the vec extension loaded.

    Returns a ready-to-use ``sqlite3.Connection`` against ``self.vec_db_path``.
    """
    db_conn = sqlite3.connect(self.vec_db_path)
    # Extension loading is enabled only for the duration of the load call,
    # then switched back off so later SQL cannot load arbitrary extensions.
    db_conn.enable_load_extension(True)
    sqlite_vec_sl_tmp.load(db_conn)
    db_conn.enable_load_extension(False)
    return db_conn

Expand Down Expand Up @@ -129,12 +129,12 @@ def __del__(self):
help="Directory containing JSONL files. Default is 'data'.",
)
parser.add_argument(
"--db-path",
"--vec-db-path",
type=str,
default="./sqlite_data/vectordb.db",
help="Path to SQLite database file. Default is './sqlite_data/vectordb.db'.",
)
args = parser.parse_args()

importer = PackageImporter(jsonl_dir=args.jsonl_dir, db_path=args.db_path)
importer = PackageImporter(jsonl_dir=args.jsonl_dir, vec_db_path=args.vec_db_path)
asyncio.run(importer.run_import())
2 changes: 1 addition & 1 deletion src/codegate/ca/codegate_ca.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def _load_existing_certificates(self) -> None:
self._cert_cache[common_name] = CachedCertificate(
cert_path=cert_path,
key_path=key_path,
creation_time=datetime.utcnow(),
creation_time=datetime.now(datetime.UTC),
)
else:
logger.debug(f"Skipping expired certificate for {common_name}")
Expand Down
1 change: 1 addition & 0 deletions src/codegate/codegate_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def setup_logging(
# Set explicitly the log level for other modules
logging.getLogger("sqlalchemy").disabled = True
logging.getLogger("uvicorn.error").disabled = True
logging.getLogger("aiosqlite").disabled = True

# Create a logger for our package
logger = structlog.get_logger("codegate")
Expand Down
42 changes: 33 additions & 9 deletions src/codegate/dashboard/post_processing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import json
import re
from typing import List, Optional, Tuple, Union

import structlog
Expand Down Expand Up @@ -180,6 +181,20 @@ async def parse_get_prompt_with_output(
)


def parse_question_answer(input_text: str) -> str:
    """Strip a leading "Context: ... Query:" preamble from *input_text*.

    Some stored prompts take the shape of a context section followed by a
    blank line and a "Query:" marker. When that shape is detected, only the
    text after "Query:" is returned; any other input passes through unchanged.
    """
    # DOTALL lets ".*?" span a multi-line context section; the pattern is
    # anchored at the start, so re.match is equivalent to re.search here.
    preamble = re.match(r'^Context:.*?\n\n\s*Query:\s*(.*)$', input_text, re.DOTALL)
    return preamble.group(1) if preamble else input_text


async def match_conversations(
partial_conversations: List[Optional[PartialConversation]],
) -> List[Conversation]:
Expand All @@ -205,17 +220,26 @@ async def match_conversations(
conversations = []
for chat_id, sorted_convers in sorted_convers.items():
questions_answers = []
first_partial_conversation = None
for partial_conversation in sorted_convers:
questions_answers.append(partial_conversation.question_answer)
conversations.append(
Conversation(
question_answers=questions_answers,
provider=partial_conversation.provider,
type=partial_conversation.type,
chat_id=chat_id,
conversation_timestamp=sorted_convers[0].request_timestamp,
# check if we have an answer, otherwise do not add it
if partial_conversation.question_answer.answer is not None:
first_partial_conversation = partial_conversation
partial_conversation.question_answer.question.message = parse_question_answer(
partial_conversation.question_answer.question.message)
questions_answers.append(partial_conversation.question_answer)

# only add conversation if we have some answers
if len(questions_answers) > 0 and first_partial_conversation is not None:
conversations.append(
Conversation(
question_answers=questions_answers,
provider=first_partial_conversation.provider,
type=first_partial_conversation.type,
chat_id=chat_id,
conversation_timestamp=sorted_convers[0].request_timestamp,
)
)
)

return conversations

Expand Down
7 changes: 4 additions & 3 deletions src/codegate/db/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,8 @@ async def record_outputs(self, outputs: List[Output]) -> Optional[Output]:
"""
)
recorded_output = await self._insert_pydantic_model(output_db, sql)
logger.debug(f"Recorded output: {recorded_output}")
# Uncomment to debug
# logger.debug(f"Recorded output: {recorded_output}")
return recorded_output

async def record_alerts(self, alerts: List[Alert]) -> List[Alert]:
Expand Down Expand Up @@ -177,8 +178,8 @@ async def record_alerts(self, alerts: List[Alert]) -> List[Alert]:
recorded_alerts.append(alert_result)
if alert_result and alert_result.trigger_category == "critical":
await alert_queue.put(f"New alert detected: {alert.timestamp}")

logger.debug(f"Recorded alerts: {recorded_alerts}")
# Uncomment to debug the recorded alerts
# logger.debug(f"Recorded alerts: {recorded_alerts}")
return recorded_alerts

def _should_record_context(self, context: Optional[PipelineContext]) -> bool:
Expand Down
6 changes: 4 additions & 2 deletions src/codegate/pipeline/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ def add_alert(
timestamp=datetime.datetime.now(datetime.timezone.utc),
)
)
logger.debug(f"Added alert to context: {self.alerts_raised[-1]}")
# Uncomment the below to debug
# logger.debug(f"Added alert to context: {self.alerts_raised[-1]}")

def add_input_request(
self, normalized_request: ChatCompletionRequest, is_fim_request: bool, provider: str
Expand Down Expand Up @@ -159,7 +160,8 @@ def add_output(self, model_response: ModelResponse) -> None:
output=output_str,
)
)
logger.debug(f"Added output to context: {self.output_responses[-1]}")
# Uncomment the below to debug the responses
# logger.debug(f"Added output to context: {self.output_responses[-1]}")
except Exception as e:
logger.error(f"Failed to serialize output: {model_response}", error=str(e))
return
Expand Down
21 changes: 11 additions & 10 deletions src/codegate/storage/storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List

import numpy as np
import sqlite_vec
import sqlite_vec_sl_tmp
import structlog

from codegate.config import Config
Expand Down Expand Up @@ -62,7 +62,7 @@ def _get_connection(self):
try:
conn = sqlite3.connect(self.db_path)
conn.enable_load_extension(True)
sqlite_vec.load(conn)
sqlite_vec_sl_tmp.load(conn)
conn.enable_load_extension(False)
return conn
except Exception as e:
Expand Down Expand Up @@ -200,14 +200,15 @@ async def search(

# Log the raw SQL results
rows = cursor.fetchall()
logger.debug(
"Raw SQL results",
row_count=len(rows),
rows=[
{"name": row[0], "type": row[1], "status": row[2], "description": row[3]}
for row in rows
],
)
# Uncomment the following lines to log
# logger.debug(
# "Raw SQL results",
# row_count=len(rows),
# rows=[
# {"name": row[0], "type": row[1], "status": row[2], "description": row[3]}
# for row in rows
# ],
# )

results = []
query_words = None
Expand Down
Loading

0 comments on commit ebd5b80

Please sign in to comment.