Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split ArticleSet into CandidateSet and RecommendationList #156

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,651 changes: 1,682 additions & 1,969 deletions pixi.lock

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import torch as th

from poprox_concepts import ArticleSet, InterestProfile
from poprox_concepts import CandidateSet, InterestProfile
from poprox_concepts.domain import RecommendationList
from poprox_recommender.components.diversifiers.calibration import Calibrator
from poprox_recommender.topics import extract_locality, normalized_category_count

Expand All @@ -12,7 +13,7 @@ class LocalityCalibrator(Calibrator):
def __init__(self, theta: float = 0.1, num_slots=10):
super().__init__(theta, num_slots)

def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
normalized_locality_prefs = normalized_category_count(interest_profile.click_locality_counts)

if candidate_articles.scores is not None:
Expand All @@ -29,7 +30,7 @@ def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestPro
self.theta,
topk=self.num_slots,
)
return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
return RecommendationList(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])

def add_article_to_categories(self, rec_categories, article):
locality_list = extract_locality(article)
Expand Down
6 changes: 3 additions & 3 deletions src/poprox_recommender/components/diversifiers/mmr.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import torch
from lenskit.pipeline import Component

from poprox_concepts import ArticleSet, InterestProfile
from poprox_concepts import CandidateSet, InterestProfile
from poprox_recommender.pytorch.datachecks import assert_tensor_size
from poprox_recommender.pytorch.decorators import torch_inference

Expand All @@ -12,7 +12,7 @@ def __init__(self, theta: float = 0.8, num_slots: int = 10):
self.num_slots = num_slots

@torch_inference
def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> CandidateSet:
if candidate_articles.scores is None:
return candidate_articles

Expand All @@ -21,7 +21,7 @@ def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestPro
scores = torch.as_tensor(candidate_articles.scores).to(similarity_matrix.device)
article_indices = mmr_diversification(scores, similarity_matrix, theta=self.theta, topk=self.num_slots)

return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
return CandidateSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])


def compute_similarity_matrix(todays_article_vectors):
Expand Down
6 changes: 3 additions & 3 deletions src/poprox_recommender/components/diversifiers/pfar.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import torch as th
from lenskit.pipeline import Component

from poprox_concepts import Article, ArticleSet, InterestProfile
from poprox_concepts import Article, CandidateSet, InterestProfile
from poprox_recommender.pytorch.decorators import torch_inference
from poprox_recommender.topics import GENERAL_TOPICS, extract_general_topics, normalized_category_count

Expand All @@ -15,7 +15,7 @@ def __init__(self, lambda_: float = 1.0, tau: float | None = None, num_slots: in
self.num_slots = num_slots

@torch_inference
def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> CandidateSet:
if candidate_articles.scores is None:
return candidate_articles

Expand All @@ -41,7 +41,7 @@ def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestPro
topk=self.num_slots,
)

return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
return CandidateSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])


def pfar_diversification(relevance_scores, articles, topic_preferences, lamb, tau, topk) -> list[Article]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import torch as th

from poprox_concepts import ArticleSet, InterestProfile
from poprox_concepts import CandidateSet, InterestProfile
from poprox_concepts.domain import RecommendationList
from poprox_recommender.components.diversifiers.calibration import Calibrator
from poprox_recommender.topics import extract_general_topics, normalized_category_count

Expand All @@ -11,7 +12,7 @@
# to rerank recommendations according to
# topic calibration
class TopicCalibrator(Calibrator):
def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
normalized_topic_prefs = self.compute_topic_dist(interest_profile)

if candidate_articles.scores is not None:
Expand All @@ -29,7 +30,7 @@ def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestPro
topk=self.num_slots,
)

return ArticleSet(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])
return RecommendationList(articles=[candidate_articles.articles[int(idx)] for idx in article_indices])

def compute_topic_dist(self, interest_profile):
topic_preferences: dict[str, int] = defaultdict(int)
Expand Down
12 changes: 6 additions & 6 deletions src/poprox_recommender/components/embedders/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from safetensors.torch import load_file
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from poprox_concepts import ArticleSet
from poprox_concepts import CandidateSet
from poprox_recommender.model import ModelConfig
from poprox_recommender.model.nrms.news_encoder import NewsEncoder
from poprox_recommender.paths import model_file_path
Expand Down Expand Up @@ -58,7 +58,7 @@ def __init__(self, model_path: PathLike, device: str | None):
self.embedding_cache = {}

@torch_inference
def __call__(self, article_set: ArticleSet) -> ArticleSet:
def __call__(self, article_set: CandidateSet) -> CandidateSet:
if not article_set.articles:
article_set.embeddings = th.zeros((0, self.news_encoder.embedding_size)) # type: ignore
return article_set
Expand Down Expand Up @@ -116,21 +116,21 @@ def __call__(self, article_set: ArticleSet) -> ArticleSet:

class EmbeddingCopier(Component):
@torch_inference
def __call__(self, candidate_set: ArticleSet, selected_set: ArticleSet) -> ArticleSet:
def __call__(self, candidate_set: CandidateSet, selected_set: CandidateSet) -> CandidateSet:
"""
Copies article embeddings from a candidate set to a set of selected/recommended articles

Parameters
----------
candidate_set : ArticleSet
candidate_set : CandidateSet
A set of candidate articles with the `.embeddings` property filled in
(e.g. with ArticleEmbedder)
selected_set : ArticleSet
selected_set : CandidateSet
A set of selected or recommended articles chosen from `candidate_set`

Returns
-------
ArticleSet
CandidateSet
selected_set with `.embeddings` set using the embeddings from `candidate_set`
"""
candidate_article_ids = [article.article_id for article in candidate_set.articles]
Expand Down
14 changes: 7 additions & 7 deletions src/poprox_recommender/components/embedders/topic_wise_user.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import torch as th

from poprox_concepts import Article, ArticleSet, Click, InterestProfile
from poprox_concepts import Article, CandidateSet, Click, InterestProfile
from poprox_recommender.components.embedders import NRMSArticleEmbedder, NRMSUserEmbedder
from poprox_recommender.paths import model_file_path
from poprox_recommender.pytorch.decorators import torch_inference
Expand Down Expand Up @@ -116,7 +116,7 @@ def virtual_clicks(onboarding_topics, topic_articles):

class UserOnboardingEmbedder(NRMSUserEmbedder):
article_embedder: NRMSArticleEmbedder
embedded_topic_articles: ArticleSet | None = None
embedded_topic_articles: CandidateSet | None = None

def __init__(self, *args, embedding_source: str = "static", topic_embedding: str = "nrms", **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -128,10 +128,10 @@ def __init__(self, *args, embedding_source: str = "static", topic_embedding: str

@torch_inference
def __call__(
self, candidate_articles: ArticleSet, clicked_articles: ArticleSet, interest_profile: InterestProfile
self, candidate_articles: CandidateSet, clicked_articles: CandidateSet, interest_profile: InterestProfile
) -> InterestProfile:
if self.embedded_topic_articles is None:
self.embedded_topic_articles = self.article_embedder(ArticleSet(articles=TOPIC_ARTICLES))
self.embedded_topic_articles = self.article_embedder(CandidateSet(articles=TOPIC_ARTICLES))

topic_embeddings_by_uuid = {
article.article_id: embedding
Expand Down Expand Up @@ -182,15 +182,15 @@ def __call__(

return interest_profile

def build_article_lookup(self, article_set: ArticleSet):
def build_article_lookup(self, article_set: CandidateSet):
embedding_lookup = {}
for article, article_vector in zip(article_set.articles, article_set.embeddings, strict=True):
if article.article_id not in embedding_lookup:
embedding_lookup[article.article_id] = article_vector

return embedding_lookup

def build_embeddings_from_articles(self, articles: ArticleSet, topic_articles: list[Article]):
def build_embeddings_from_articles(self, articles: CandidateSet, topic_articles: list[Article]):
topic_uuids_by_name = {article.external_id: article.article_id for article in topic_articles}

topic_embeddings_by_uuid = {}
Expand Down Expand Up @@ -220,7 +220,7 @@ def find_topical_articles(self, topic: str, articles: list[Article]) -> list[Art
return topical_articles

def build_embeddings_from_definitions(self):
topic_article_set = self.article_embedder(ArticleSet(articles=TOPIC_ARTICLES))
topic_article_set = self.article_embedder(CandidateSet(articles=TOPIC_ARTICLES))

topic_embeddings_by_uuid = {
article.article_id: embedding for article, embedding in zip(TOPIC_ARTICLES, topic_article_set.embeddings)
Expand Down
4 changes: 2 additions & 2 deletions src/poprox_recommender/components/embedders/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from lenskit.pipeline import Component
from safetensors.torch import load_file

from poprox_concepts import ArticleSet, Click, InterestProfile
from poprox_concepts import CandidateSet, Click, InterestProfile
from poprox_recommender.model import ModelConfig
from poprox_recommender.model.nrms.user_encoder import UserEncoder
from poprox_recommender.pytorch.decorators import torch_inference
Expand All @@ -23,7 +23,7 @@ def __init__(self, model_path: PathLike, device: str = "cpu", max_clicks_per_use
self.user_encoder.to(device)

@torch_inference
def __call__(self, clicked_articles: ArticleSet, interest_profile: InterestProfile) -> InterestProfile:
def __call__(self, clicked_articles: CandidateSet, interest_profile: InterestProfile) -> InterestProfile:
if len(clicked_articles.articles) == 0:
interest_profile.embedding = None
else:
Expand Down
6 changes: 3 additions & 3 deletions src/poprox_recommender/components/filters/topic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

from lenskit.pipeline import Component

from poprox_concepts import ArticleSet, InterestProfile
from poprox_concepts import CandidateSet, InterestProfile

logger = logging.getLogger(__name__)


class TopicFilter(Component):
def __call__(self, candidate: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
def __call__(self, candidate: CandidateSet, interest_profile: InterestProfile) -> CandidateSet:
# Preference values from onboarding are 1-indexed, where 1 means "absolutely no interest."
# We might want to normalize them to 0-indexed somewhere upstream, but in the meantime
# this is one of the simpler ways to filter out topics people aren't interested in from
Expand All @@ -32,4 +32,4 @@ def __call__(self, candidate: ArticleSet, interest_profile: InterestProfile) ->
len(candidate.articles),
interest_profile.profile_id,
)
return ArticleSet(articles=topical_articles)
return CandidateSet(articles=topical_articles)
8 changes: 4 additions & 4 deletions src/poprox_recommender/components/joiners/concat.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from lenskit.pipeline import Component

from poprox_concepts import ArticleSet
from poprox_concepts.domain import RecommendationList


class Concatenate(Component):
def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
def __call__(self, recs1: RecommendationList, recs2: RecommendationList) -> RecommendationList:
"""
Concatenates two sets of candidates, while deduplicating them, keeping the
first occurrence of each article (by id), and maintaining their original order.
Expand All @@ -15,7 +15,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleS
the dict keys can be ignored and the dict values are the deduplicated candidates
in reverse order. Reversing them one more time returns them to the original order.
"""
reverse_articles = reversed(candidates1.articles + candidates2.articles)
reverse_articles = reversed(recs1.articles + recs2.articles)
articles = {article.article_id: article for article in reverse_articles}

return ArticleSet(articles=list(reversed(articles.values())))
return RecommendationList(articles=list(reversed(articles.values())))
17 changes: 10 additions & 7 deletions src/poprox_recommender/components/joiners/fill.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from lenskit.pipeline import Component
from lenskit.pipeline.types import Lazy

from poprox_concepts import ArticleSet
from poprox_concepts import CandidateSet
from poprox_concepts.domain import RecommendationList


class Fill(Component):
def __init__(self, num_slots: int, deduplicate: bool = True):
self.num_slots = num_slots
self.deduplicate = deduplicate

def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> ArticleSet:
articles = candidates1.articles
def __call__(
self, recs1: CandidateSet | RecommendationList, recs2: Lazy[CandidateSet | RecommendationList]
) -> RecommendationList:
articles = recs1.articles

if self.deduplicate:
# Track the articles by their article_id
Expand All @@ -19,7 +22,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> Ar
# Add articles from candidates2 only if they are not duplicates
if len(articles) < self.num_slots:
new_articles = []
for article in candidates2.get().articles:
for article in recs2.get().articles:
# Check if the article is a duplicate based on article_id
if (article.article_id) not in existing_articles:
new_articles.append(article)
Expand All @@ -30,7 +33,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: Lazy[ArticleSet]) -> Ar

articles = articles + new_articles
else:
articles = articles + candidates2.get().articles
articles = articles + recs2.get().articles

# Return the resulting ArticleSet, limiting the size to num_slots
return ArticleSet(articles=articles[: self.num_slots])
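# Return the resulting articles, limited to num_slots.
# NOTE(review): the signature declares -> RecommendationList, but a CandidateSet is constructed here; confirm which is intended.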
return CandidateSet(articles=articles[: self.num_slots])
8 changes: 4 additions & 4 deletions src/poprox_recommender/components/joiners/interleave.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

from lenskit.pipeline import Component

from poprox_concepts import ArticleSet
from poprox_concepts.domain import RecommendationList


class Interleave(Component):
def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
def __call__(self, recs1: RecommendationList, recs2: RecommendationList) -> RecommendationList:
articles = []
for pair in zip_longest(candidates1.articles, candidates2.articles):
for pair in zip_longest(recs1.articles, recs2.articles):
for article in pair:
if article is not None:
articles.append(article)

return ArticleSet(articles=articles)
return RecommendationList(articles=articles)
10 changes: 5 additions & 5 deletions src/poprox_recommender/components/joiners/rrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@

from lenskit.pipeline import Component

from poprox_concepts import ArticleSet
from poprox_concepts.domain import RecommendationList


class ReciprocalRankFusion(Component):
def __init__(self, num_slots: int, k: int = 60):
self.num_slots = num_slots
self.k = k

def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleSet:
articles = candidates1.articles
def __call__(self, recs1: RecommendationList, recs2: RecommendationList) -> RecommendationList:
articles = recs1.articles
article_scores = defaultdict(float)
articles_by_id = {}

Expand All @@ -20,7 +20,7 @@ def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleS
article_scores[article.article_id] = article_scores[article.article_id] + score
articles_by_id[article.article_id] = article

for i, article in enumerate(candidates2.articles, 1):
for i, article in enumerate(recs2.articles, 1):
score = 1 / (i + self.k)
article_scores[article.article_id] = article_scores[article.article_id] + score
articles_by_id[article.article_id] = article
Expand All @@ -32,4 +32,4 @@ def __call__(self, candidates1: ArticleSet, candidates2: ArticleSet) -> ArticleS
articles_by_id[article_id] for article_id in sorted_article_ids[: self.num_slots]
]

return ArticleSet(articles=reciprocal_rank_fusioned_articles)
return RecommendationList(articles=reciprocal_rank_fusioned_articles)
7 changes: 4 additions & 3 deletions src/poprox_recommender/components/rankers/topk.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import numpy as np
from lenskit.pipeline import Component

from poprox_concepts import ArticleSet, InterestProfile
from poprox_concepts import CandidateSet, InterestProfile
from poprox_concepts.domain import RecommendationList


class TopkRanker(Component):
def __init__(self, num_slots=10):
self.num_slots = num_slots

def __call__(self, candidate_articles: ArticleSet, interest_profile: InterestProfile) -> ArticleSet:
def __call__(self, candidate_articles: CandidateSet, interest_profile: InterestProfile) -> RecommendationList:
articles = []
if candidate_articles.scores is not None:
article_indices = np.argsort(candidate_articles.scores)[-self.num_slots :][::-1]

articles = [candidate_articles.articles[int(idx)] for idx in article_indices]

return ArticleSet(articles=articles)
return RecommendationList(articles=articles)
Loading
Loading