Skip to content

Commit

Permalink
I added some json repairs that helped me with malformed messages (#341)
Browse files Browse the repository at this point in the history
* I added some json repairs that helped me with malformed messages

There are two of them: The first will remove hard line feeds that appear
in the message part because the model added those instead of escaped
line feeds. This happens a lot in my experiments and that actually fixes
them.

The second one is less tested and should handle the case that the model
answers with multiple blocks of strings in quotes or even uses unescaped
quotes. It should grab everything between the message: " and the ending
curly braces, escape it, and make it proper JSON that way.

Disclaimer: Both functions were written with the help of ChatGPT-4 (I
can't write much Python). I think the first one is quite solid but doubt
that the second one is fully working. Maybe somebody with more Python
skills than me (or with more time) has a better idea for this type of
malformed reply.

* Moved the repair output behind the debug flag and removed the "clean" one

* Added even more fixes (out of what I just encountered while testing)

It seems that cut-off JSON can be corrected, and sometimes the model is too
lazy to add not just one curly brace but two. I think it does not "cost"
a lot to try them all out. But the exceptions get massive that way :)

* black

* for the final hail mary with extract_first_json, might as well add a double end bracket instead of single

---------

Co-authored-by: cpacker <[email protected]>
  • Loading branch information
oderwat and cpacker authored Nov 10, 2023
1 parent 34e6371 commit f4a2047
Show file tree
Hide file tree
Showing 2 changed files with 165 additions and 5 deletions.
106 changes: 101 additions & 5 deletions memgpt/local_llm/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,114 @@ def add_missing_heartbeat(llm_json):
raise NotImplementedError


def repair_json_string(json_string):
    """
    Escape raw control characters that were emitted inside JSON string literals.

    Models frequently emit a hard line feed (and, with CRLF output, a carriage
    return, or a literal tab) inside a quoted value instead of the escaped
    sequence. Those raw characters are illegal inside a JSON string, so each one
    found *inside* a string literal is replaced by its two-character escape
    (backslash + 'n' / 'r' / 't'). Whitespace outside of string literals is
    structural and is left untouched.

    Args:
        json_string: The (possibly malformed) JSON text.

    Returns:
        A copy of ``json_string`` with in-string line feeds, carriage returns
        and tabs escaped. The result is not guaranteed to be valid JSON.
    """
    # Raw character -> JSON escape sequence to substitute while inside a string
    replacements = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}

    pieces = []  # collected with join() to avoid quadratic string concatenation
    in_string = False
    escape = False  # True when the previous character was an unescaped backslash

    for char in json_string:
        # Only an unescaped quote opens or closes a string literal
        if char == '"' and not escape:
            in_string = not in_string

        # A backslash escapes the next character unless it is itself escaped
        escape = char == "\\" and not escape

        if in_string and char in replacements:
            pieces.append(replacements[char])
        else:
            pieces.append(char)

    return "".join(pieces)


def repair_even_worse_json(json_string):
    """
    Consolidate a broken-up "message" value into a single JSON string.

    Handles output where the model split the message into several quoted
    fragments or used unescaped quotes. Everything between the ``"message":``
    key and the next closing curly brace is merged into one quoted string:
    unescaped quotes are dropped, line feeds become spaces, and escape
    sequences (a backslash plus the following character) are kept together
    inside the message.

    This is a best-effort heuristic: a ``}`` inside the message text ends the
    message early, and whitespace between the key and the first quote ends up
    at the start of the message.

    Args:
        json_string: The malformed JSON text.

    Returns:
        The rewritten text. It is not guaranteed to be valid JSON; callers are
        expected to try parsing it and fall back if that fails.
    """
    # State flags
    in_message = False  # currently collecting the value of "message"
    in_string = False  # inside a quoted fragment (toggled by unescaped quotes)
    escape = False  # previous character was an unescaped backslash
    message_content = []

    # Storage for the new JSON
    new_json_parts = []

    for char in json_string:
        if char == '"' and not escape:
            in_string = not in_string
            if not in_message:
                # Outside the message, quotes are structural and are kept
                new_json_parts.append(char)
        elif char == "\\" and not escape:
            escape = True
            if not in_message:
                new_json_parts.append(char)
            # In message mode the backslash is held back and emitted together
            # with the character it escapes (next iteration), so the pair
            # cannot be separated or land outside the consolidated string.
        elif escape and in_message:
            escape = False
            message_content.append("\\" + char)
        else:
            escape = False
            if in_message:
                if char == "}":
                    # Emit the consolidated message, then the closing brace
                    new_json_parts.append('"{}"'.format("".join(message_content).replace("\n", " ")))
                    new_json_parts.append(char)
                    in_message = False
                elif in_string or char.isalnum() or char.isspace() or char in ".',;:!":
                    # Collect the message content, excluding structural characters
                    message_content.append(char)
            else:
                # Not in message mode: copy the character through unchanged
                new_json_parts.append(char)
                if '"message":' in "".join(new_json_parts[-10:]):
                    # The key was just completed; start collecting its value
                    in_message = True
                    message_content = []

    return "".join(new_json_parts)


def clean_json(raw_llm_output, messages=None, functions=None):
    """Try a bunch of hacks to parse the data coming out of the LLM.

    Repair strategies are attempted in order, from cheapest to most invasive,
    and the first one that parses wins:

    1. the raw output as-is
    2. the raw output with ``}``, ``}}`` or ``"}}`` appended (cut-off output)
    3. ``repair_json_string`` (raw line feeds inside string literals)
    4. ``repair_even_worse_json`` (message split into several quoted pieces)
    5. ``extract_first_json`` on the output with ``}}`` appended

    Args:
        raw_llm_output: Text produced by the model that should contain JSON.
        messages: Unused by this function; kept for interface compatibility.
        functions: Unused by this function; kept for interface compatibility.

    Returns:
        The parsed data from the first strategy that succeeds.

    Raises:
        Whatever ``extract_first_json`` raises when the last strategy fails too.
    """
    from memgpt.utils import printd

    # printd("clean json runs:", raw_llm_output)
    try:
        return json.loads(raw_llm_output)
    except json.JSONDecodeError:
        pass

    # The model often stops early: try closing the object(s) / the string
    for suffix in ("}", "}}", '"}}'):
        printd(f"trying adding {suffix}")
        try:
            return json.loads(raw_llm_output + suffix)
        except json.JSONDecodeError:
            continue

    # Structural repairs; each is only computed if everything before it failed
    for repair in (repair_json_string, repair_even_worse_json):
        repaired = repair(raw_llm_output)
        printd(f"trying {repair.__name__}:", repaired)
        try:
            return json.loads(repaired)
        except json.JSONDecodeError:
            continue

    # Final hail mary: any failure here propagates to the caller
    printd("trying first_json")
    return extract_first_json(raw_llm_output + "}}")
64 changes: 64 additions & 0 deletions tests/test_json_parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import json

import memgpt.local_llm.json_parser as json_parser


# Fixture: the outer object is never closed (one "}" short).
EXAMPLE_MISSING_CLOSING_BRACE = """{
"function": "send_message",
"params": {
"inner_thoughts": "Oops, I got their name wrong! I should apologize and correct myself.",
"message": "Sorry about that! I assumed you were Chad. Welcome, Brad! "
}
"""

# Fixture: a complete object followed by a stray "<|>" token.
EXAMPLE_BAD_TOKEN_END = """{
"function": "send_message",
"params": {
"inner_thoughts": "Oops, I got their name wrong! I should apologize and correct myself.",
"message": "Sorry about that! I assumed you were Chad. Welcome, Brad! "
}
}<|>"""

# Fixture: two complete objects back to back in a single reply.
EXAMPLE_DOUBLE_JSON = """{
"function": "core_memory_append",
"params": {
"name": "human",
"content": "Brad, 42 years old, from Germany."
}
}
{
"function": "send_message",
"params": {
"message": "Got it! Your age and nationality are now saved in my memory."
}
}
"""

# Fixture: raw (unescaped) line feeds inside the "message" string literal.
EXAMPLE_HARD_LINE_FEEDS = """{
"function": "send_message",
"params": {
"message": "Let's create a list:
- First, we can do X
- Then, we can do Y!
- Lastly, we can do Z :)"
}
}
"""


def test_json_parsers():
    """Try various broken JSON and check that the parsers can fix it.

    Each fixture must (a) be rejected by plain ``json.loads`` and (b) be
    accepted by ``json_parser.clean_json``.
    """

    test_strings = [EXAMPLE_MISSING_CLOSING_BRACE, EXAMPLE_BAD_TOKEN_END, EXAMPLE_DOUBLE_JSON, EXAMPLE_HARD_LINE_FEEDS]

    for string in test_strings:
        # Sanity check: the fixture really is malformed. The failure is raised
        # in the ``else`` branch so that it cannot be swallowed by the handler.
        try:
            json.loads(string)
        except json.JSONDecodeError:
            print("String failed (expectedly)")
        else:
            raise AssertionError(f"Test JSON string should have failed basic JSON parsing:\n{string}")

        # The repair pipeline must cope with it; attach the offending input to
        # the failure while keeping the original exception as the cause.
        try:
            json_parser.clean_json(string)
        except Exception as e:
            raise AssertionError(f"Failed to repair test JSON string:\n{string}") from e

0 comments on commit f4a2047

Please sign in to comment.