Add optional redis caching to Priviblur (#26)
syeopite committed Feb 19, 2024
2 parents 7838996 + 4685afb commit 46dd9ab
Showing 28 changed files with 717 additions and 81 deletions.
12 changes: 10 additions & 2 deletions assets/js/post.js
@@ -6,7 +6,15 @@ function requestPollResults(poll_element, pollId) {
    const blogName = post.getElementsByClassName("blog-name")[0].innerHTML;
    const postId = post.dataset.postId;

-   const pollResultsFetch = fetch(`/api/v1/poll/${blogName}/${postId}/${pollId}/results`);
+   let poll_results_url;
+
+   if (poll_element.classList.contains("expired-poll")) {
+       poll_results_url = `/api/v1/poll/${blogName}/${postId}/${pollId}/results?expired=1`;
+   } else {
+       poll_results_url = `/api/v1/poll/${blogName}/${postId}/${pollId}/results`;
+   }
+
+   const pollResultsFetch = fetch(poll_results_url);

    pollResultsFetch.then((results) => {
        return results.json();
@@ -17,7 +25,7 @@ function requestPollResults(poll_element, pollId) {
}

function fill_poll_results(poll_element, results) {
-   const sorted_poll_results = Object.entries(results.response.results).sort((a,b) => (a[1]-b[1])).reverse();
+   const sorted_poll_results = Object.entries(results.results).sort((a,b) => (a[1]-b[1])).reverse();

    // First we must find the total number of votes and the winner(s) of the poll
    let total_votes = 0;
28 changes: 25 additions & 3 deletions config.example.toml
@@ -18,8 +18,29 @@
# real_ip_header =
# proxies_count =

+ # Controls redis cache options
+ # Ignore to disable the cache
+ #
+ # [cache]
+ # url =
+
+ # Number of seconds to cache poll results from active polls
+ # cache_active_poll_results_for = 3600
+
+ # Number of seconds to cache poll results from expired polls
+ # cache_expired_poll_results_for = 86400
+
+ # Number of seconds to cache feed (explore, search, etc) results for
+ # cache_feed_for = 3600
+
+ # Number of seconds to cache blog feed (blog posts, blog search, blog tagged posts, etc) results for
+ # cache_blog_feed_for = 3600
+
+ # Number of seconds to cache individual posts for
+ # cache_blog_post_for = 300
+
# Controls behaviors pertaining to the way Priviblur requests Tumblr
- [priviblur_backend]
+ # [priviblur_backend]
# # Timeout for requests to Tumblr's API
# main_response_timeout = 10

@@ -31,7 +52,7 @@
#
# Use Python's numerical logging levels
# https://docs.python.org/3/howto/logging.html#logging-levels
- [logging]
+ # [logging]
# # Sanic (Server)'s logging level
# sanic_logging_level = 30

@@ -41,6 +62,7 @@
# # Priviblur extractor's logging level
# priviblur_extractor_logging_level = 20

- [misc]
+
+ # [misc]
# # Enable sanic's dev mode
# dev_mode = false
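
For reference, a minimal sketch of what an enabled cache section could look like (the URL is illustrative; when using the bundled priviblur-redis compose service below, the host would be the service name):

    [cache]
    url = "redis://priviblur-redis:6379"

    # Cache active poll results for an hour, expired ones for a day
    cache_active_poll_results_for = 3600
    cache_expired_poll_results_for = 86400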
17 changes: 17 additions & 0 deletions docker-compose.yml
@@ -10,3 +10,20 @@ services:
    volumes:
      - ./config.toml:/priviblur/config.toml:Z,ro

  # Taken from https://github.com/zedeus/nitter/blob/b62d73dbd373f08af07c7a79efcd790d3bc1a49c/docker-compose.yml#L27-L44
  priviblur-redis:
    image: redis:6.2-alpine
    container_name: priviblur-redis
    command: redis-server --loglevel warning
    volumes:
      - priviblur-redis:/data
    restart: unless-stopped
    user: "999:1000"
    read_only: true
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL

volumes:
  priviblur-redis:
1 change: 1 addition & 0 deletions requirements.txt
@@ -15,6 +15,7 @@ multidict==6.0.5
npf_renderer==0.12.0
orjson==3.9.13
PyYAML==6.0.1
+ redis==5.0.1
sanic==23.12.1
sanic-ext==23.12.0
sanic-routing==23.12.0
5 changes: 5 additions & 0 deletions src/cache/__init__.py
@@ -0,0 +1,5 @@
from .poll_results import get_poll_results
from .search import get_search_results
from .explore import get_explore_results
from .tagged import get_tag_browse_results
from .blogs import get_blog_posts, get_blog_post
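
As a rough sketch of how these entry points are meant to be consumed (the blog name and post ID here are hypothetical; ctx is the application context carrying CacheDb and TumblrAPI, as used in the modules below):

    timeline = await get_blog_posts(ctx, "staff")
    post = await get_blog_post(ctx, "staff", "712345678")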
116 changes: 116 additions & 0 deletions src/cache/base.py
@@ -0,0 +1,116 @@
import abc
import typing

import orjson

from .. import priviblur_extractor


class AccessCache(abc.ABC):
    def __init__(self, ctx, prefix, cache_ttl, continuation=None, **kwargs):
        self.ctx = ctx
        self.prefix = prefix
        self.cache_ttl = cache_ttl

        self.continuation = continuation
        self.kwargs = kwargs

    @abc.abstractmethod
    async def fetch(self) -> typing.Dict[str, typing.Any]:
        """Fetches results from Tumblr"""
        pass

    @abc.abstractmethod
    def parse(self, initial_results):
        """Parses the initial JSON response from Tumblr"""
        pass

    @abc.abstractmethod
    def build_key(self) -> str:
        """Creates a key to get/store an item within the cache"""
        pass

    def parse_cached_json(self, json):
        return priviblur_extractor.models.timeline.Timeline.from_json(json)

    def get_key(self):
        base_key = self.build_key()

        if self.continuation:
            full_key_with_continuation = f"{base_key}:{self.continuation}"
        else:
            full_key_with_continuation = base_key

        return base_key, full_key_with_continuation

    async def parse_and_cache(self, base_key, full_key_with_continuation, initial_results):
        """Parses the given results and inserts them into the cache under the given key

        Also allocates a placeholder item within the cache for the next continuation batch, if applicable.
        """
        pipeline = self.ctx.CacheDb.pipeline()

        timeline = self.parse(initial_results)

        pipeline.set(full_key_with_continuation, orjson.dumps(timeline.to_json_serialisable()))
        pipeline.expire(full_key_with_continuation, self.cache_ttl)

        # Allocate a key slot for the next continuation
        #
        # When a given continuation is invalid Tumblr returns the data for the initial page. As such,
        # we need to add in an extra check here to ensure that a malicious user does not arbitrarily add
        # in data to the cache
        #
        # "0" is used as a placeholder

        if timeline.next and timeline.next.cursor:
            next_key = f"{base_key}:{timeline.next.cursor}"
            pipeline.setnx(next_key, "0")
            pipeline.expire(next_key, self.cache_ttl)

            self.ctx.LOGGER.debug("Cache: Allocating a slot for continuation batch with key \"%s\"", next_key)

        await pipeline.execute()

        return timeline

    async def get_cached(self):
        """Retrieves an item from the cache

        Falls back to fetching fresh data from Tumblr (and caching it) when no usable entry exists.
        """
        base_key, full_key_with_continuation = self.get_key()
        cached_result = await self.ctx.CacheDb.get(full_key_with_continuation)

        # See comment in self.parse_and_cache as to why "0"
        if not cached_result or cached_result == "0":
            initial_results = await self.fetch()

            # When the current request has a continuation token attached, we'll only cache
            # when a slot has already been allocated for it from the previous request.
            if self.continuation and not cached_result:
                return self.parse(initial_results)
            else:
                self.ctx.LOGGER.info("Cache: Adding \"%s\" to the cache", full_key_with_continuation)
                return await self.parse_and_cache(base_key, full_key_with_continuation, initial_results)
        else:
            self.ctx.LOGGER.info("Cache: Cached version of \"%s\" found", full_key_with_continuation)

            initial_results_from_cache = orjson.loads(cached_result)

            if initial_results_from_cache["version"] != priviblur_extractor.models.VERSION:
                self.ctx.LOGGER.debug(
                    "Cache: Version mismatch! Cached object is from a different version of Priviblur (%(cached_version)s != %(priviblur_version)s). Fetching new response...",
                    dict(cached_version=initial_results_from_cache["version"], priviblur_version=priviblur_extractor.models.VERSION)
                )
                new_initial_results = await self.fetch()
                return await self.parse_and_cache(base_key, full_key_with_continuation, new_initial_results)

            return self.parse_cached_json(initial_results_from_cache)

    async def get(self):
        """Retrieves data from either the cache or Tumblr itself"""
        if self.ctx.CacheDb:
            return await self.get_cached()
        else:
            initial_results = await self.fetch()
            return self.parse(initial_results)
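
To illustrate the continuation handshake above (the key and cursor values here are hypothetical), the sequence of cache operations for a two-page feed looks roughly like:

    GET   search:foo            -> nil    (miss: fetch page one, cache it, then...)
    SETNX search:foo:CURSOR "0"           (...reserve a slot for the next cursor)
    GET   search:foo:CURSOR     -> "0"    (slot was allocated: fetch page two, overwrite the placeholder)
    GET   search:foo:FORGED     -> nil    (never-allocated cursor: parsed and returned, but not cached)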
74 changes: 74 additions & 0 deletions src/cache/blogs.py
@@ -0,0 +1,74 @@
from .base import AccessCache
from .. import priviblur_extractor


class BlogPostsCache(AccessCache):
    def __init__(self, ctx, blog, continuation, **kwargs):
        super().__init__(
            ctx=ctx,
            prefix=f"blog:{blog}",
            cache_ttl=ctx.PRIVIBLUR_CONFIG.cache.cache_blog_feed_for,
            continuation=continuation,
            **kwargs
        )

        self.blog = blog

    async def fetch(self):
        """Fetches blog posts from Tumblr"""
        return await self.ctx.TumblrAPI.blog_posts(self.blog, continuation=self.continuation, **self.kwargs)

    def parse(self, initial_results):
        return priviblur_extractor.parse_blog_timeline(initial_results)

    def parse_cached_json(self, json):
        return priviblur_extractor.models.blog.Blog.from_json(json)

    def build_key(self):
        # blog:<blog_name>:<kwargs>:<continuation>
        path_to_cached_results = [self.prefix]
        for k, v in self.kwargs.items():
            if v:
                path_to_cached_results.append(f"{k}:{v}")

        return ':'.join(path_to_cached_results)


class BlogPostCache(AccessCache):
    def __init__(self, ctx, blog, post_id, **kwargs):
        super().__init__(
            ctx=ctx,
            prefix=f"blog:{blog}:post:{post_id}",
            cache_ttl=ctx.PRIVIBLUR_CONFIG.cache.cache_blog_post_for,
            **kwargs
        )

        self.blog = blog
        self.post_id = post_id

    async def fetch(self):
        """Fetches a single blog post from Tumblr"""
        return await self.ctx.TumblrAPI.blog_post(self.blog, self.post_id, **self.kwargs)

    def parse(self, initial_results):
        return priviblur_extractor.parse_timeline(initial_results)

    def build_key(self):
        # blog:<blog_name>:post:<post_id>:<kwargs>
        path_to_cached_results = [self.prefix]
        for k, v in self.kwargs.items():
            if v:
                path_to_cached_results.append(f"{k}:{v}")

        return ':'.join(path_to_cached_results)


async def get_blog_posts(ctx, blog, continuation=None, **kwargs):
    blog_posts_cache = BlogPostsCache(ctx, blog, continuation, **kwargs)
    return await blog_posts_cache.get()


async def get_blog_post(ctx, blog, post_id, **kwargs):
    blog_post_cache = BlogPostCache(ctx, blog, post_id, **kwargs)
    return await blog_post_cache.get()
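
As a sketch of the key layout this produces (the blog name, IDs, and query parameter are hypothetical):

    blog:staff                       first page of a blog's posts
    blog:staff:<continuation>        a continuation page of the same feed
    blog:staff:query:python          a filtered feed, via the kwargs suffix
    blog:staff:post:712345678        an individual post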
35 changes: 35 additions & 0 deletions src/cache/explore.py
@@ -0,0 +1,35 @@
from .base import AccessCache
from .. import priviblur_extractor


class ExploreCache(AccessCache):
    def __init__(self, ctx, type_, continuation, fetch_function, **kwargs):
        super().__init__(
            ctx=ctx,
            prefix=f"explore:{type_}",
            cache_ttl=ctx.PRIVIBLUR_CONFIG.cache.cache_feed_for,
            continuation=continuation,
            **kwargs
        )

        self.fetch_function = fetch_function

    async def fetch(self):
        """Fetches explore results from Tumblr"""
        return await self.fetch_function(
            continuation=self.continuation,
            **self.kwargs
        )

    def parse(self, initial_results):
        return priviblur_extractor.parse_timeline(initial_results)

    def build_key(self):
        return self.prefix


async def get_explore_results(ctx, fetch_function, type_, continuation, **kwargs):
    explore_cache = ExploreCache(ctx, type_, continuation, fetch_function, **kwargs)
    return await explore_cache.get()
50 changes: 50 additions & 0 deletions src/cache/poll_results.py
@@ -0,0 +1,50 @@
async def get_poll_results(ctx, blog, post_id, poll_id, expired=False):
    """Gets poll results from the given data

    Attempts to retrieve them from the cache first and foremost, and only requests Tumblr when the data is either unavailable or expired.
    """
    if ctx.CacheDb:
        cached_result = await ctx.CacheDb.hgetall(f"polls:{poll_id}")
        if cached_result:
            timestamp = cached_result.pop("timestamp")
            poll_results = {k: int(v) for k, v in cached_result.items()}

            return {"timestamp": timestamp, "results": poll_results}
        else:
            initial_results = await _fetch_poll_results(ctx.TumblrAPI, blog, post_id, poll_id)
            await _cache_poll_results(ctx, initial_results, poll_id, expired)

            return initial_results
    else:
        return await _fetch_poll_results(ctx.TumblrAPI, blog, post_id, poll_id)


async def _fetch_poll_results(tumblr_api, blog, post_id, poll_id):
    """Requests poll results from Tumblr"""
    initial_results = await tumblr_api.poll_results(blog, post_id, poll_id)
    return initial_results["response"]


async def _cache_poll_results(ctx, results, poll_id, expired):
    """Caches the given poll results"""
    if expired:
        ttl = ctx.PRIVIBLUR_CONFIG.cache.cache_expired_poll_results_for
    else:
        ttl = ctx.PRIVIBLUR_CONFIG.cache.cache_active_poll_results_for

    pipeline = ctx.CacheDb.pipeline()

    cache_id = f"polls:{poll_id}"

    pipeline.hset(cache_id, mapping={
        **results["results"],
        "timestamp": results["timestamp"],
    })

    pipeline.expire(cache_id, ttl)

    await pipeline.execute()
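
For illustration, a cached poll ends up as a flat redis hash whose fields are the answer entries from Tumblr's response plus the poll's timestamp (the IDs and counts below are made up):

    HGETALL polls:123456
      "<answer-id-1>"  "42"
      "<answer-id-2>"  "17"
      "timestamp"      "1708300000"

On a cache hit, the timestamp field is popped back out and the remaining fields are coerced to ints, reconstructing the {"timestamp": ..., "results": {...}} shape the route expects.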

