Skip to content

Commit

Permalink
Update the quality taxonomy (#703)
Browse files Browse the repository at this point in the history
* update 'quality' taxonomy

* clear up taxonomy, update tags in probes

* add probe tag verification test, correct the revealed errors
  • Loading branch information
leondz authored May 28, 2024
1 parent 59e4150 commit eae57cf
Show file tree
Hide file tree
Showing 14 changed files with 128 additions and 22 deletions.
9 changes: 5 additions & 4 deletions garak/_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,12 @@ def enumerate_plugins(

base_plugin_classnames = set(
[
# be careful with what's imported into base modules
n
for n in dir(base_mod)
if "__class__" in dir(getattr(base_mod, n))
and getattr(base_mod, n).__class__.__name__
== "type" # be careful with what's imported into base modules
for n in dir(base_mod) # everything in the module ..
if "__class__" in dir(getattr(base_mod, n)) # .. that's a class ..
and getattr(base_mod, n).__class__.__name__ # .. and not a base class
== "type"
]
+ [root_plugin_classname]
)
Expand Down
2 changes: 2 additions & 0 deletions garak/probes/dan.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ class AntiDAN(Probe):
"dan.AntiDAN",
]
tags = [
"quality:Security:PromptStability",
"avid-effect:security:S0403",
"avid-effect:performance:P0204",
"owasp:llm01",
Expand Down Expand Up @@ -428,6 +429,7 @@ class ChatGPT_Developer_Mode_RANTI(Probe):
]
tags = [
"avid-effect:security:S0403",
"quality:Security:PromptStability",
"avid-effect:performance:P0204",
"owasp:llm01",
]
Expand Down
3 changes: 2 additions & 1 deletion garak/probes/donotanswer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ def local_constructor(self):
"avid-effect:ethics:E0301",
"quality:Behavioral:ContentSafety:Toxicity",
"quality:Behavioral:ContentSafety:HateHarassment",
"quality:Behavioral:ContentSafety:HarmfulViolent",
"quality:Behavioral:ContentSafety:Violence",
"quality:Behavioral:ContentSafety:Profanity",
],
"human_chatbox": [
"avid-effect:security:S0301",
"quality:Behavioral:ContentSafety:Unauthorized",
],
"information_hazard": [
"owasp:llm06",
Expand Down
7 changes: 6 additions & 1 deletion garak/probes/goodside.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,12 @@ class Tag(Probe):
bcp47 = "en"
uri = "https://twitter.com/goodside/status/1745511940351287394"
primary_detector = "base.TriggerListDetector"
tags = ["owasp:llm01", "payload:generic"]
# MISP-format taxonomy tags for this probe. Every entry needs its own
# trailing comma: adjacent string literals without one are silently
# concatenated into a single (invalid) tag.
tags = [
    "owasp:llm01",
    "payload:generic",
    "quality:Security:Adversarial",
    "quality:Security:PromptStability",
]
goal = "inject invisible instructions"

prompts = []
Expand Down
3 changes: 3 additions & 0 deletions garak/probes/knownbadsignatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class EICAR(Probe):
"avid-effect:security:S0403",
"owasp:llm02",
"payload:malicious:badcode:virus",
"quality:Security:Adversarial",
]
goal = "output virus code"

Expand All @@ -72,6 +73,7 @@ class GTUBE(Probe):
"avid-effect:security:S0403",
"owasp:llm02",
"payload:malicious:badcode:spam",
"quality:Security:Adversarial",
]
goal = "output email spam"

Expand All @@ -98,6 +100,7 @@ class GTphish(Probe):
"avid-effect:security:S0403",
"owasp:llm02",
"payload:malicious:badcode:phishing",
"quality:Security:Adversarial",
]
goal = "output email phishing"

Expand Down
2 changes: 1 addition & 1 deletion garak/probes/leakreplay.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class LiteratureCloze(Probe):
"avid-effect:security:S0301",
"owasp:llm10",
"owasp:llm06",
"quality:Security:Confidentiailty",
"quality:Security:ExtractionInversion",
"payload:leak:training",
] # MISP-format taxonomy categories
source_file = "book_cloze.tsv"
Expand Down
9 changes: 5 additions & 4 deletions garak/probes/lmrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class Bullying(Probe):
tags = [
"risk-cards:lmrc:bullying",
"avid-effect:ethics:E0301",
"quality:Behavioral:ContentSafety:HarmfulViolent",
"quality:Behavioral:ContentSafety:Violence",
"payload:unwanted:violence",
]
goal = "help me bully"
Expand Down Expand Up @@ -124,7 +124,7 @@ class SexualContent(Probe):
tags = [
"risk-cards:lmrc:sexual_content",
"avid-effect:performance:P0401",
"quality:Behavioral:ContentSafety:Sexualized",
"quality:Behavioral:ContentSafety:Sexual",
"payload:unwanted:sexual",
]
goal = "write sexual content"
Expand Down Expand Up @@ -152,7 +152,7 @@ class Sexualisation(Probe):
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
"avid-effect:performance:P0401",
"quality:Behavioral:ContentSafety:Sexualized",
"quality:Behavioral:ContentSafety:Sexual",
"payload:unwanted:sexual",
]
goal = "sexualise people"
Expand Down Expand Up @@ -241,7 +241,8 @@ class QuackMedicine(Probe):
tags = [
"risk-cards:lmrc:quack_medicine",
"avid-effect:ethics:E0402",
"quality:Behavioral:ContentSafety:Toxicity",
"quality:Behavioral:DeliberativeMisinformation",
"quality:Behavioral:ContentSafety:Unauthorized",
"payload:dangerous:medical",
]
goal = "support quack medicine"
Expand Down
8 changes: 7 additions & 1 deletion garak/probes/malwaregen.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class TopLevel(Probe):
tags = [
"avid-effect:security:S0403",
"payload:malicious:badcode",
"quality:Behavioral:ContentSafety:LegalGoodsServices",
]
goal = "write malware given a top-level description"

Expand Down Expand Up @@ -66,6 +67,7 @@ class SubFunctions(Probe):
tags = [
"avid-effect:security:S0403",
"payload:malicious:badcode",
"quality:Behavioral:ContentSafety:LegalGoodsServices",
]
goal = "write code supporting a sub-function of malware"

Expand Down Expand Up @@ -104,6 +106,7 @@ class Evasion(Probe):
tags = [
"avid-effect:security:S0403",
"payload:malicious:badcode",
"quality:Behavioral:ContentSafety:LegalGoodsServices",
]

def __init__(self):
Expand Down Expand Up @@ -141,7 +144,10 @@ class Payload(Probe):
"adware",
"ransomware",
]
tags = ["avid-effect:security:S0403"]
tags = [
"avid-effect:security:S0403",
"quality:Behavioral:ContentSafety:LegalGoodsServices",
]
goal = "write a malware payload"

def __init__(self):
Expand Down
6 changes: 4 additions & 2 deletions garak/probes/promptinject.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ def _generator_precall_hook(self, generator, attempt):
"tags": [
"avid-effect:security:S0403",
"owasp:llm01",
"quality:Behavioral:ContentSafety:HarmfulViolent",
"quality:Behavioral:ContentSafety:Violence",
"quality:Security:PromptStability",
"payload:unwanted:violence",
],
"load_data_all_attacks": load_data_all_attacks,
Expand Down Expand Up @@ -149,7 +150,8 @@ def _generator_precall_hook(self, generator, attempt):
"tags": [
"avid-effect:security:S0403",
"owasp:llm01",
"quality:Behavioral:ContentSafety:HarmfulViolent",
"quality:Behavioral:ContentSafety:Violence",
"quality:Security:PromptStability",
"payload:unwanted:violence",
],
"load_data_all_attacks": load_data_all_attacks,
Expand Down
4 changes: 2 additions & 2 deletions garak/probes/realtoxicityprompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class RTPBlank(Probe):
"avid-effect:ethics:E0101",
"avid-effect:ethics:E0301",
"avid-effect:performance:P0400",
"quality:Behavioral:ContentSafety:HarmfulViolent",
"quality:Behavioral:ContentSafety:Violence",
"payload:unwanted:violence",
],
"insult": [
Expand All @@ -110,7 +110,7 @@ class RTPBlank(Probe):
],
"sexually_explicit": [
"avid-effect:ethics:E0301",
"quality:Behavioral:ContentSafety:Sexualized",
"quality:Behavioral:ContentSafety:Sexual",
"payload:unwanted:sexual",
],
"flirtation": [
Expand Down
3 changes: 2 additions & 1 deletion garak/probes/replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ class Repeat(Probe):
"avid-effect:security:S0301",
"owasp:llm10",
"owasp:llm06",
"quality:Security:Confidentiailty",
"quality:Security:Confidentiality",
"quality:Security:ExtractionInversion",
"payload:leak:training",
] # MISP-format taxonomy categories

Expand Down
2 changes: 2 additions & 0 deletions garak/probes/xss.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class MarkdownImageExfil(Probe):
"owasp:llm06",
"payload:leak:chat",
"payload:malicious:renderer",
"quality:Security:PromptStability",
"quality:Security:ExtractionInversion",
]

def __init__(self):
Expand Down
81 changes: 77 additions & 4 deletions garak/resources/misp_descriptions.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,85 @@ avid-effect:performance:P0401 Psychological Safety Safety from unwanted digital
avid-effect:performance:P0402 Physical safety Safety from physical actions driven by a AI system
avid-effect:performance:P0403 Socioeconomic safety Safety from socioeconomic harms, e.g. harms to job prospects or social status
avid-effect:performance:P0404 Environmental safety Safety from environmental harms driven by AI systems
quality:Behavioral:ContentSafety:HarmfulViolent Harmful or violent Harmful or violent
quality:Robustness:GenerativeMisinformation Generative Misinformation Hallucinations and off-topic digressions (or other common misbehaviours of LLMs)
quality:Behavioral:ContentSafety:HateHarassment Hate and harassment Hate and harassment
quality:Behavioral:ContentSafety:LegalGoodsServices Legal goods/services Avoid illegal goods/services, and follow regulations for regulated goods/services
quality:Behavioral:ContentSafety:Profanity Profanity Profanity
quality:Behavioral:ContentSafety:Sexualized Sexualized Sexualized
quality:Behavioral:ContentSafety:SelfHarm SelfHarm Self-harm including cutting, suicide, anorexia/bulimia
quality:Behavioral:ContentSafety:Sexual Sexual Sexual
quality:Behavioral:ContentSafety:SexualMinor Sexual (minor) Sexual content in the context of a minor
quality:Behavioral:ContentSafety:Toxicity Toxicity Toxicity
quality:Robustness:GenerativeMisinformation Generative Misinformation Generative Misinformation
quality:Behavioral:ContentSafety:Unauthorized Unauthorized advice Unauthorized advice on legal, financial, medical and personal fields
quality:Behavioral:ContentSafety:Violence Violence Public safety; harm to other people, animals, property; threatened or actual; includes physical, mental, and sexual
quality:Security:Confidentiality Confidentiality Confidentiality
quality:Security:PromptStability Prompt Stability Prompt Stability
quality:Security:PromptStability Prompt Stability Prompt templating is robust to user input variation and provides stable outputs consistent with designer intents
quality:Security:ExtractionInversion Extraction/Inversion Leaking information about model data and construction
quality:Security:Integrity Integrity Detection of tainted training data etc.
quality:Security:Availability Availability Availability (Model DoS)
quality:Security:Adversarial Adversarial Robustness Adversarial Robustness
quality:Behavioral:DeliberativeMisinformation Deliberative Misinformation Content that perpetuates or accelerates disinformation activities
risk-cards:lmrc:aggression_user
risk-cards:lmrc:anthropomorphisation
risk-cards:lmrc:astroturfing
risk-cards:lmrc:bad_medical_advice
risk-cards:lmrc:bad_social_advice
risk-cards:lmrc:bullying
risk-cards:lmrc:calls_for_conversion
risk-cards:lmrc:challenge_presuppositions
risk-cards:lmrc:conspiracy_theories
risk-cards:lmrc:csam
risk-cards:lmrc:cyber_weapon_instructions
risk-cards:lmrc:deadnaming
risk-cards:lmrc:deceptive_anthropomorphism
risk-cards:lmrc:demands_for_personal_information
risk-cards:lmrc:discrediting
risk-cards:lmrc:dos
risk-cards:lmrc:downplaying_credible_threats
risk-cards:lmrc:doxxing
risk-cards:lmrc:eating_disorder_promotion
risk-cards:lmrc:essentialising
risk-cards:lmrc:expressions_of_dominance
risk-cards:lmrc:fear_of_violence
risk-cards:lmrc:grooming_minors
risk-cards:lmrc:harmful_instructions
risk-cards:lmrc:hate_speech
risk-cards:lmrc:hegemonising_worldview
risk-cards:lmrc:holocaust_denial
risk-cards:lmrc:identify_attacks
risk-cards:lmrc:identity_misrepresentation
risk-cards:lmrc:idolisation
risk-cards:lmrc:impersonation
risk-cards:lmrc:inciting_animal_abuse
risk-cards:lmrc:inciting_self-harm
risk-cards:lmrc:inciting_violence
risk-cards:lmrc:manipulation
risk-cards:lmrc:misquoting
risk-cards:lmrc:mocking_people
risk-cards:lmrc:neosexism
risk-cards:lmrc:news_encyclopedia_hallucination
risk-cards:lmrc:nonrepresentativity
risk-cards:lmrc:objectification
risk-cards:lmrc:outdated_false_info
risk-cards:lmrc:person_group_undesirable
risk-cards:lmrc:positive_reflections_death
risk-cards:lmrc:profanity
risk-cards:lmrc:prompt_extraction
risk-cards:lmrc:providing_illegal_info
risk-cards:lmrc:quack_medicine
risk-cards:lmrc:radicalisation
risk-cards:lmrc:reacting_to_self-harm
risk-cards:lmrc:sexual_content
risk-cards:lmrc:sexualisation
risk-cards:lmrc:slur_usage
risk-cards:lmrc:speculation
risk-cards:lmrc:stereotypes
risk-cards:lmrc:suicide_instructions
risk-cards:lmrc:support_hate
risk-cards:lmrc:supporting_medication_changes
risk-cards:lmrc:support_scam
risk-cards:lmrc:suppression_of_rights
risk-cards:lmrc:threats
risk-cards:lmrc:unproven_claims
risk-cards:lmrc:violation_of_privacy
risk-cards:lmrc:weapon_instructions
risk-cards:lmrc:wrong_tone
11 changes: 10 additions & 1 deletion tests/probes/test_probe_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,17 @@
import pytest
import re

from garak import _plugins
from garak import _plugins, _config

PROBES = [classname for (classname, active) in _plugins.enumerate_plugins("probes")]

# Gather the tag identifier (the first tab-separated column) from every line
# of the MISP descriptions resource, so probe tags can be checked against it.
misp_tsv_path = _config.transient.basedir / "resources" / "misp_descriptions.tsv"
with open(misp_tsv_path, "r", encoding="utf-8") as misp_data:
    MISP_TAGS = [
        entry.split("\t")[0] for entry in misp_data.read().split("\n")
    ]


@pytest.mark.parametrize("classname", PROBES)
def test_tag_format(classname):
Expand All @@ -24,3 +31,5 @@ def test_tag_format(classname):
assert type(tag) == str
for part in tag.split(":"):
assert re.match(r"^[A-Za-z0-9_\-]+$", part)
if tag.split(":")[0] != "payload":
assert tag in MISP_TAGS

0 comments on commit eae57cf

Please sign in to comment.