def _fold_case(value):
    """Return *value* lower-cased one character at a time, keeping its length.

    ``str.lower()`` may expand a single character into several code points
    (for example "İ" U+0130), which would shift every later index. A
    character whose lower-case form is not exactly one character is kept
    unchanged, so ``len(_fold_case(s)) == len(s)`` always holds.
    """
    folded = []
    for ch in value:
        lowered = ch.lower()
        folded.append(lowered if len(lowered) == 1 else ch)
    return "".join(folded)


def censor_text(text, words):
    """Mask every case-insensitive occurrence of each sensitive word.

    Matching is always done against the original (case-folded) text, not
    against the partially masked result, so occurrences that overlap each
    other -- of the same word or of different words -- are all masked.

    Args:
        text: The string to censor.
        words: Iterable of sensitive words. Empty entries are ignored.

    Returns:
        A string of the same length as *text* in which every character
        covered by a match is replaced with ``*``.
    """
    masked_chars = list(text)
    # Length-preserving fold: an index into folded_text is also a valid
    # index into masked_chars.
    folded_text = _fold_case(text)

    for word in words:
        if not word:
            # An empty word matches everywhere and masks nothing.
            continue

        needle = _fold_case(word)  # folded once per word, not per position
        width = len(needle)

        start = folded_text.find(needle)
        while start != -1:
            masked_chars[start:start + width] = "*" * width
            # Advance by one (not by width) so overlapping matches are found.
            start = folded_text.find(needle, start + 1)

    return "".join(masked_chars)