diff --git a/package-lock.json b/package-lock.json
index 692ee5da..6379e404 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,6 +9,7 @@
       "version": "0.9.0",
       "license": "GPL-3.0-only",
       "dependencies": {
+        "@anthropic-ai/sdk": "0.33.1",
         "@attestate/crawler": "0.6.2",
         "@attestate/crawler-call-block-logs": "0.5.0",
         "@attestate/delegator2": "0.5.1",
@@ -150,6 +151,20 @@
         "npm": ">=7.0.0"
       }
     },
+    "node_modules/@anthropic-ai/sdk": {
+      "version": "0.33.1",
+      "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.33.1.tgz",
+      "integrity": "sha512-VrlbxiAdVRGuKP2UQlCnsShDHJKWepzvfRCkZMpU+oaUdKLpOfmylLMRojGrAgebV+kDtPjewCVP0laHXg+vsA==",
+      "dependencies": {
+        "@types/node": "^18.11.18",
+        "@types/node-fetch": "^2.6.4",
+        "abort-controller": "^3.0.0",
+        "agentkeepalive": "^4.2.1",
+        "form-data-encoder": "1.7.2",
+        "formdata-node": "^4.3.2",
+        "node-fetch": "^2.6.7"
+      }
+    },
     "node_modules/@attestate/crawler": {
       "version": "0.6.2",
       "resolved": "https://registry.npmjs.org/@attestate/crawler/-/crawler-0.6.2.tgz",
@@ -2780,8 +2795,21 @@
       "license": "MIT"
     },
     "node_modules/@types/node": {
-      "version": "17.0.2",
-      "license": "MIT"
+      "version": "18.19.71",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.71.tgz",
+      "integrity": "sha512-evXpcgtZm8FY4jqBSN8+DmOTcVkkvTmAayeo4Wf3m1xAruyVGzGuDh/Fb/WWX2yLItUiho42ozyJjB0dw//Tkw==",
+      "dependencies": {
+        "undici-types": "~5.26.4"
+      }
+    },
+    "node_modules/@types/node-fetch": {
+      "version": "2.6.12",
+      "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.12.tgz",
+      "integrity": "sha512-8nneRWKCg3rMtF69nLQJnOYUcbafYeFSjqkw3jCRLsqkWFlHaoQrr5mXmofFGOx3DKn7UfmBMyov8ySvLRVldA==",
+      "dependencies": {
+        "@types/node": "*",
+        "form-data": "^4.0.0"
+      }
     },
     "node_modules/@types/normalize-package-data": {
       "version": "2.4.1",
@@ -2876,6 +2904,17 @@
         "node": ">= 6.0.0"
       }
     },
+    "node_modules/agentkeepalive": {
+      "version": "4.6.0",
+      "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.6.0.tgz",
+      "integrity": "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==",
+      "dependencies": {
+        "humanize-ms": "^1.2.1"
+      },
+      "engines": {
+        "node": ">= 8.0.0"
+      }
+    },
     "node_modules/aggregate-error": {
       "version": "3.1.0",
       "license": "MIT",
@@ -5237,6 +5276,23 @@
         "node": ">= 6"
       }
     },
+    "node_modules/form-data-encoder": {
+      "version": "1.7.2",
+      "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz",
+      "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="
+    },
+    "node_modules/formdata-node": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz",
+      "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==",
+      "dependencies": {
+        "node-domexception": "1.0.0",
+        "web-streams-polyfill": "4.0.0-beta.3"
+      },
+      "engines": {
+        "node": ">= 12.20"
+      }
+    },
     "node_modules/forwarded": {
       "version": "0.2.0",
       "license": "MIT",
@@ -5709,6 +5765,14 @@
         "node": ">=10.17.0"
       }
     },
+    "node_modules/humanize-ms": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz",
+      "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==",
+      "dependencies": {
+        "ms": "^2.0.0"
+      }
+    },
     "node_modules/iconv-lite": {
       "version": "0.4.24",
       "license": "MIT",
@@ -7351,6 +7415,24 @@
         "node": ">=0.8"
       }
     },
+    "node_modules/node-domexception": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz",
+      "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/jimmywarting"
+        },
+        {
+          "type": "github",
+          "url": "https://paypal.me/jimmywarting"
+        }
+      ],
+      "engines": {
+        "node": ">=10.5.0"
+      }
+    },
     "node_modules/node-fetch": {
       "version": "2.6.7",
       "license": "MIT",
@@ -10116,6 +10198,11 @@
       "version": "1.12.1",
       "license": "MIT"
     },
+    "node_modules/undici-types": {
+      "version": "5.26.5",
+      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
+      "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="
+    },
     "node_modules/unicode-trie": {
       "version": "2.0.0",
       "license": "MIT",
@@ -10422,6 +10509,14 @@
         "node": ">= 14"
       }
     },
+    "node_modules/web-streams-polyfill": {
+      "version": "4.0.0-beta.3",
+      "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz",
+      "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==",
+      "engines": {
+        "node": ">= 14"
+      }
+    },
     "node_modules/web-worker": {
       "version": "1.2.0",
       "license": "Apache-2.0"
diff --git a/package.json b/package.json
index 2c6268f9..b8d13467 100644
--- a/package.json
+++ b/package.json
@@ -41,6 +41,7 @@
   },
   "homepage": "https://github.com/attestate/kiwistand#readme",
   "dependencies": {
+    "@anthropic-ai/sdk": "0.33.1",
     "@attestate/crawler": "0.6.2",
     "@attestate/crawler-call-block-logs": "0.5.0",
     "@attestate/delegator2": "0.5.1",
diff --git a/src/parser.mjs b/src/parser.mjs
index a3f176d5..86d9e4b6 100644
--- a/src/parser.mjs
+++ b/src/parser.mjs
@@ -8,6 +8,7 @@ import vhtml from "vhtml";
 import { parse as parser } from "node-html-parser";
 import { fetchBuilder, FileSystemCache } from "node-fetch-cache";
 import { useAgent } from "request-filtering-agent";
+import Anthropic from "@anthropic-ai/sdk";
 
 import cache from "./cache.mjs";
 import log from "./logger.mjs";
@@ -28,6 +29,175 @@ const filtered = [
   "instagram.com",
 ];
 
+const anthropic = new Anthropic({
+  apiKey: env.ANTHROPIC_API_KEY,
+});
+// Hosts treated as tweet sources; metadata() rewrites them to fxtwitter.com.
+const twitterFrontends = [
+  "xcancel.com",
+  "nitter.privacydev.net",
+  "nitter.poast.org",
+  "nitter.lucabased.xyz",
+  "nitter.kavin.rocks",
+  "nitter.tiekoetter.com",
+  "nitter.qwik.space",
+  "bird.habedieeh.re",
+  "nitter.lunar.icu",
+  "nitter.moomoo.me",
+  "nitter.kylrth.com",
+  "nitter.io.lol",
+  "nitter.rawbit.ninja",
+  "nitter.holo-mix.com",
+  "twitter.com",
+  "x.com",
+];
+const CLAUDE_DOMAINS = ["warpcast.com", "fxtwitter.com", ...twitterFrontends];
+
+const GUIDELINES = `We have an opportunity to build our own corner of the onchain internet. With awesome people, links, resources, and learning.
+
+Our content focuses on:
+- Technical resources, hacking, and awesome git repos
+- Dune dashboards, reports, data-driven articles
+- Startups, cryptocurrencies, cryptography
+- Networking, privacy, decentralization
+- Hardware, open source, art, economics, game theory
+- Anything else our community finds fascinating, from philosophy through science to infrastructure
+
+Title Guidelines:
+- Maximum 80 characters
+- Use sentence case instead of title case
+- Must be clear and descriptive
+- No sensationalist journalism or clickbait
+- No overly optimized headlines
+- No cliffhanger headlines
+- No fluff headlines
+- No embellishing
+- Must tell exactly what to expect
+- If an article has a good original title, use that
+- If the original title is too long, trim it while keeping the substance
+- Avoid pay-walled article titles unless highly relevant
+- For technical content, be specific about the technology/protocol involved
+- For crypto content, mention relevant chains/protocols where appropriate`;
+
+/**
+ * Ask Claude for a title for `content` that follows GUIDELINES.
+ *
+ * @param {string} content - Text to derive the title from.
+ * @returns {Promise<string|null>} The title, or null when content is empty,
+ *   the API request fails, or the response contains no title.
+ */
+async function generateClaudeTitle(content) {
+  // A missing description would otherwise be interpolated into the prompt as
+  // the literal text "undefined".
+  if (typeof content !== "string" || content.trim() === "") {
+    return null;
+  }
+
+  let response;
+  try {
+    response = await anthropic.messages.create({
+      model: "claude-3-5-haiku-20241022",
+      max_tokens: 100,
+      temperature: 0,
+      tools: [
+        {
+          name: "generate_title",
+          description:
+            "Generate a title following the provided guidelines for our Web3/crypto hacker news platform.",
+          input_schema: {
+            type: "object",
+            properties: {
+              title: {
+                type: "string",
+                description:
+                  "The generated title that follows all provided guidelines",
+              },
+            },
+            required: ["title"],
+          },
+        },
+      ],
+      tool_choice: { type: "tool", name: "generate_title" },
+      messages: [
+        {
+          role: "user",
+          content: `Here are our submission guidelines:\n\n${GUIDELINES}\n\nBased on these guidelines, generate a title for this content:\n\n${content}`,
+        },
+      ],
+    });
+  } catch (error) {
+    console.error("Claude API request failed:", error);
+    return null;
+  }
+
+  try {
+    const toolUse = response.content.find((c) => c.type === "tool_use");
+    if (!toolUse?.input?.title) {
+      console.error("No title found in Claude response");
+      return null;
+    }
+    return toolUse.input.title;
+  } catch (error) {
+    console.error("Error extracting title from response:", error);
+    return null;
+  }
+}
+
+/**
+ * Fetch the text of a Farcaster cast from Neynar's cast lookup endpoint.
+ *
+ * @param {string} url - URL of the cast.
+ * @returns {Promise<string|null>} The cast text, or null on any failure.
+ */
+async function extractWarpcastContent(url) {
+  try {
+    // searchParams percent-encodes the cast URL so that its own "?", "&" or "#"
+    // cannot be read as part of the Neynar query string.
+    const apiUrl = new URL("https://api.neynar.com/v2/farcaster/cast");
+    apiUrl.searchParams.set("identifier", url);
+    apiUrl.searchParams.set("type", "url");
+
+    const response = await fetch(apiUrl.toString(), {
+      headers: {
+        accept: "application/json",
+        // NOTE(review): hard-coded key; looks like a shared demo key - confirm and move to env.
+        "X-Api-Key": "NEYNAR_API_DOCS",
+        "User-Agent":
+          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+      },
+    });
+    if (!response.ok) {
+      console.error("Neynar API error: HTTP status", response.status);
+      return null;
+    }
+
+    const data = await response.json();
+    return data?.cast?.text || null;
+  } catch (error) {
+    console.error("Neynar API error:", error);
+    return null;
+  }
+}
+
+/**
+ * Extract the text that follows the first data-testid="tweetText" attribute.
+ *
+ * @param {string} html - HTML to search.
+ * @returns {string|null} Text with tags removed and whitespace collapsed, or null.
+ */
+function extractTwitterContent(html) {
+  const tweetTextMatch = html.match(
+    /data-testid="tweetText"[^>]*>(.*?)<\/div>/s,
+  );
+  if (tweetTextMatch) {
+    return tweetTextMatch[1]
+      .replace(/<[^>]*>/g, " ") // Remove HTML tags
+      .replace(/\s+/g, " ") // Normalize whitespace
+      .trim();
+  }
+  return null;
+}
+
 async function extractCanonicalLink(html) {
   const dom = parser(html);
   const node = dom.querySelector('link[rel="canonical"]');
@@ -94,7 +264,7 @@ export const metadata = async (url) => {
 
   const { hostname } = urlObj;
 
-  if (hostname === "twitter.com" || hostname === "x.com") {
+  if (twitterFrontends.includes(hostname)) {
     urlObj.hostname = "fxtwitter.com";
     url = urlObj.toString();
   }
@@ -166,12 +336,33 @@ export const metadata = async (url) => {
     "youtube.com",
     "youtu.be",
     "reuters.com",
-    "xcancel.com",
+    "warpcast.com",
+    ...twitterFrontends,
   ];
   let output = {};
-  if (ogTitle && !bannedTitleDomains.includes(domain)) {
+  // Scraped and model-generated titles are both untrusted text: sanitize them.
+  if (hostname === "warpcast.com") {
+    const castContent = await extractWarpcastContent(url);
+    if (castContent) {
+      const claudeTitle = await generateClaudeTitle(castContent);
+      if (claudeTitle) {
+        output.ogTitle = DOMPurify.sanitize(claudeTitle);
+      }
+    }
+  } else if (
+    bannedTitleDomains.includes(hostname) ||
+    bannedTitleDomains.includes(domain)
+  ) {
+    // The check this replaces matched on `domain`; it is kept next to the new
+    // `hostname` match so hosts it already covered do not leak their ogTitle.
+    const claudeTitle = await generateClaudeTitle(ogDescription);
+    if (claudeTitle) {
+      output.ogTitle = DOMPurify.sanitize(claudeTitle);
+    }
+  } else if (ogTitle) {
     output.ogTitle = DOMPurify.sanitize(ogTitle);
   }
+
   if (domain) {
     output.domain = DOMPurify.sanitize(domain);
   }