Search: stop relying on the DB when indexing #10696

Merged
merged 15 commits on Sep 14, 2023
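At a high level, this PR replaces the `fileify` task, whose callers had to pass the commit, build id, and search configuration along, with an `index_build` task that receives only a `build_id` and looks everything else up itself. The call sites are not part of this diff, but a hypothetical before/after sketched from the two task signatures below might look like:

# Hypothetical call sites, sketched from the task signatures in this diff.
# Before: the caller threads the search config through the task arguments.
fileify.delay(
    version_pk=version.pk,
    commit=commit,
    build=build.pk,
    search_ranking=search_ranking,
    search_ignore=search_ignore,
)
# After: the task reads the config from the build record on its own.
index_build.delay(build_id=build.pk)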
74 changes: 44 additions & 30 deletions readthedocs/projects/tasks/search.py
@@ -14,8 +14,8 @@
log = structlog.get_logger(__name__)


# TODO: remove after deploy. This is kept, so builds keep
# working while we deploy the task below.
# TODO: remove `fileify` task after deploy. We keep it for now
# so builds keep working while we deploy the `index_build` task.
@app.task(queue="reindex")
def fileify(version_pk, commit, build, search_ranking, search_ignore):
"""
@@ -44,7 +44,6 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
try:
_create_imported_files_and_search_index(
version=version,
build_id=build,
search_ranking=search_ranking,
search_ignore=search_ignore,
)
@@ -61,11 +60,18 @@ def index_build(build_id):
.first()
)
if not build:
log.debug(
"Skipping search indexing. Build object doesn't exists.", build_id=build_id
)
return

# The version may have been deleted.
version = build.version
if not version:
log.debug(
"Skipping search indexing. Build doesn't have a version attach it to it.",
build_id=build_id,
)
return

log.bind(
@@ -74,18 +80,14 @@
build_id=build.id,
)

search_ranking = []
search_ignore = []
build_config = build.config
if build_config:
search_config = build_config.get("search", {})
search_ranking = search_config.get("ranking", [])
search_ignore = search_config.get("ignore", [])
build_config = build.config or {}
search_config = build_config.get("search", {})
search_ranking = search_config.get("ranking", [])
search_ignore = search_config.get("ignore", [])

try:
_create_imported_files_and_search_index(
version=version,
build_id=build.id,
search_ranking=search_ranking,
search_ignore=search_ignore,
)
@@ -102,6 +104,10 @@ def reindex_version(version_id, search_index_name=None):
"""
version = Version.objects.filter(pk=version_id).select_related("project").first()
if not version or not version.built:
log.debug(
"Skipping search indexing. Version doesn't exist or is not built.",
version_id=version_id,
)
return

latest_successful_build = (
@@ -112,6 +118,10 @@
# If the version doesn't have a successful
# build, we don't have files to index.
if not latest_successful_build:
log.debug(
"Skipping search indexing. Version doesn't have a successful build.",
version_id=version_id,
)
return

log.bind(
Expand All @@ -120,23 +130,14 @@ def reindex_version(version_id, search_index_name=None):
build_id=latest_successful_build.id,
)

search_ranking = []
search_ignore = []
build_config = latest_successful_build.config
if build_config:
search_config = build_config.get("search", {})
search_ranking = search_config.get("ranking", [])
search_ignore = search_config.get("ignore", [])

# We need to use a build id that is different from the current one,
# otherwise we will end up with duplicated files. After the process
# is complete, we delete the files and search index that don't belong
# to the current build id. The build id isn't used for anything else.
build_id = latest_successful_build.id + 1
build_config = latest_successful_build.config or {}
search_config = build_config.get("search", {})
search_ranking = search_config.get("ranking", [])
search_ignore = search_config.get("ignore", [])

try:
_create_imported_files_and_search_index(
version=version,
build_id=build_id,
search_ranking=search_ranking,
search_ignore=search_ignore,
search_index_name=search_index_name,
@@ -155,7 +156,7 @@ def remove_search_indexes(project_slug, version_slug=None):


def _create_imported_files_and_search_index(
*, version, build_id, search_ranking, search_ignore, search_index_name=None
*, version, search_ranking, search_ignore, search_index_name=None
):
"""
Create imported files and search index for the build of the version.
@@ -170,6 +171,17 @@
storage_path = version.project.get_storage_path(
type_="html", version_slug=version.slug, include_file=False
)
# A sync ID is a number different from the `build` attribute of the previous
# sync (that attribute is pending rename). It's used to tell the files of the
# current sync apart from those of the previous one, which makes it easy to
# delete the stale files from the DB and ES.
imported_file = version.imported_files.first()
sync_id = imported_file.build + 1 if imported_file else 1

log.debug(
"Using sync ID for search indexing",
sync_id=sync_id,
)

html_files_to_index = []
html_files_to_save = []
reverse_rankings = list(reversed(search_ranking.items()))
@@ -211,11 +223,11 @@
path=relpath,
name=filename,
rank=page_rank,
# TODO: We are setting this field since it's required,
# TODO: We are setting the commit field since it's required,
# but it isn't used, and will be removed in the future
# together with other fields.
commit="unknown",
build=build_id,
build=sync_id,
ignore=skip_search_index,
)

@@ -242,18 +254,20 @@
remove_indexed_files(
project_slug=version.project.slug,
version_slug=version.slug,
build_id=build_id,
sync_id=sync_id,
index_name=search_index_name,
)

if html_files_to_save:
HTMLFile.objects.bulk_create(html_files_to_save)

# Delete imported files from the previous build of the version.
version.imported_files.exclude(build=build_id).delete()
version.imported_files.exclude(build=sync_id).delete()

# This signal is used for purging the CDN.
files_changed.send(
sender=Project,
project=version.project,
version=version,
)
return sync_id
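
The sync ID bookkeeping introduced above is the heart of the change: instead of trusting a caller-provided `build_id`, each sync derives its own ID from the newest `ImportedFile` row, indexes files under it, and evicts everything that doesn't carry it. A minimal standalone sketch of that lifecycle, with the Django and Elasticsearch plumbing replaced by plain lists so it runs on its own:

# Standalone sketch of the sync ID rotation above; the ORM and ES calls
# are stubbed with plain lists so the example is self-contained.

def next_sync_id(imported_files):
    # One more than the latest recorded sync, or 1 on the first run,
    # mirroring `imported_file.build + 1 if imported_file else 1`.
    return imported_files[0]["build"] + 1 if imported_files else 1

def sync(imported_files, new_paths):
    sync_id = next_sync_id(imported_files)
    # Index the new files under the fresh sync ID...
    fresh = [{"path": path, "build": sync_id} for path in new_paths]
    # ...then keep only records that carry it, mirroring
    # `version.imported_files.exclude(build=sync_id).delete()`.
    return fresh, sync_id

files, sync_id = sync([], ["index.html", "api/index.html"])
assert sync_id == 1
files, sync_id = sync(files, ["index.html"])
assert sync_id == 2 and all(f["build"] == 2 for f in files)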
66 changes: 31 additions & 35 deletions readthedocs/rtd_tests/tests/test_imported_file.py
@@ -33,15 +33,12 @@ def tearDown(self):
# If there are no documents, the query fails.
pass

def _manage_imported_files(
self, version, build_id, search_ranking=None, search_ignore=None
):
def _manage_imported_files(self, version, search_ranking=None, search_ignore=None):
"""Helper function for the tests to create and sync ImportedFiles."""
search_ranking = search_ranking or {}
search_ignore = search_ignore or []
_create_imported_files_and_search_index(
return _create_imported_files_and_search_index(
version=version,
build_id=build_id,
search_ranking=search_ranking,
search_ignore=search_ignore,
)
@@ -70,57 +67,57 @@ def test_properly_created(self):
"""
self.assertEqual(ImportedFile.objects.count(), 0)

self._manage_imported_files(version=self.version, build_id=1)
sync_id = self._manage_imported_files(version=self.version)
self.assertEqual(ImportedFile.objects.count(), 3)
self.assertEqual(
set(HTMLFile.objects.all().values_list("path", flat=True)),
{"index.html", "api/index.html", "404.html"},
)

results = PageDocument().search().filter("term", build=1).execute()
results = PageDocument().search().filter("term", build=sync_id).execute()
self.assertEqual(
{result.path for result in results},
{"index.html", "404.html", "test.html", "api/index.html"},
)

self._manage_imported_files(version=self.version, build_id=2)
sync_id = self._manage_imported_files(version=self.version)
self.assertEqual(ImportedFile.objects.count(), 3)
self.assertEqual(
set(HTMLFile.objects.all().values_list("path", flat=True)),
{"index.html", "api/index.html", "404.html"},
)

results = PageDocument().search().filter("term", build=2).execute()
results = PageDocument().search().filter("term", build=sync_id).execute()
self.assertEqual(
{result.path for result in results},
{"index.html", "404.html", "test.html", "api/index.html"},
)

def test_update_build(self):
self.assertEqual(ImportedFile.objects.count(), 0)
self._manage_imported_files(self.version, build_id=1)
sync_id = self._manage_imported_files(self.version)
for obj in ImportedFile.objects.all():
self.assertEqual(obj.build, 1)
self.assertEqual(obj.build, sync_id)

results = PageDocument().search().filter().execute()
self.assertEqual(len(results), 4)
for result in results:
self.assertEqual(result.build, 1)
self.assertEqual(result.build, sync_id)

self._manage_imported_files(self.version, build_id=2)
sync_id = self._manage_imported_files(self.version)
for obj in ImportedFile.objects.all():
self.assertEqual(obj.build, 2)
self.assertEqual(obj.build, sync_id)

# NOTE: we can't test that the files from the previous build
# were deleted, since deletion happens asynchronously.
results = PageDocument().search().filter("term", build=2).execute()
results = PageDocument().search().filter("term", build=sync_id).execute()
self.assertEqual(len(results), 4)

def test_page_default_rank(self):
search_ranking = {}
self.assertEqual(HTMLFile.objects.count(), 0)
self._manage_imported_files(
self.version, build_id=1, search_ranking=search_ranking
sync_id = self._manage_imported_files(
self.version, search_ranking=search_ranking
)

results = (
@@ -134,15 +131,15 @@ def test_page_default_rank(self):
for result in results:
self.assertEqual(result.project, self.project.slug)
self.assertEqual(result.version, self.version.slug)
self.assertEqual(result.build, 1)
self.assertEqual(result.build, sync_id)
self.assertEqual(result.rank, 0)

def test_page_custom_rank_glob(self):
search_ranking = {
"*index.html": 5,
}
self._manage_imported_files(
self.version, build_id=1, search_ranking=search_ranking
sync_id = self._manage_imported_files(
self.version, search_ranking=search_ranking
)

results = (
@@ -156,7 +153,7 @@ def test_page_custom_rank_glob(self):
for result in results:
self.assertEqual(result.project, self.project.slug)
self.assertEqual(result.version, self.version.slug)
self.assertEqual(result.build, 1)
self.assertEqual(result.build, sync_id)
if result.path.endswith("index.html"):
self.assertEqual(result.rank, 5, result.path)
else:
Expand All @@ -167,8 +164,8 @@ def test_page_custom_rank_several(self):
"test.html": 5,
"api/index.html": 2,
}
self._manage_imported_files(
self.version, build_id=1, search_ranking=search_ranking
sync_id = self._manage_imported_files(
self.version, search_ranking=search_ranking
)

results = (
Expand All @@ -182,7 +179,7 @@ def test_page_custom_rank_several(self):
for result in results:
self.assertEqual(result.project, self.project.slug)
self.assertEqual(result.version, self.version.slug)
self.assertEqual(result.build, 1)
self.assertEqual(result.build, sync_id)
if result.path == "test.html":
self.assertEqual(result.rank, 5)
elif result.path == "api/index.html":
Expand All @@ -195,8 +192,8 @@ def test_page_custom_rank_precedence(self):
"*.html": 5,
"api/index.html": 2,
}
self._manage_imported_files(
self.version, build_id=1, search_ranking=search_ranking
sync_id = self._manage_imported_files(
self.version, search_ranking=search_ranking
)

results = (
Expand All @@ -210,7 +207,7 @@ def test_page_custom_rank_precedence(self):
for result in results:
self.assertEqual(result.project, self.project.slug)
self.assertEqual(result.version, self.version.slug)
self.assertEqual(result.build, 1)
self.assertEqual(result.build, sync_id)
if result.path == "api/index.html":
self.assertEqual(result.rank, 2, result.path)
else:
Expand All @@ -221,8 +218,8 @@ def test_page_custom_rank_precedence_inverted(self):
"api/index.html": 2,
"*.html": 5,
}
self._manage_imported_files(
self.version, build_id=1, search_ranking=search_ranking
sync_id = self._manage_imported_files(
self.version, search_ranking=search_ranking
)

results = (
Expand All @@ -236,14 +233,13 @@ def test_page_custom_rank_precedence_inverted(self):
for result in results:
self.assertEqual(result.project, self.project.slug)
self.assertEqual(result.version, self.version.slug)
self.assertEqual(result.build, 1)
self.assertEqual(result.build, sync_id)
self.assertEqual(result.rank, 5)

def test_search_page_ignore(self):
search_ignore = ["api/index.html"]
self._manage_imported_files(
self.version,
build_id=1,
search_ignore=search_ignore,
)

@@ -273,15 +269,15 @@ def test_update_content(self):
with override_settings(DOCROOT=self.test_dir):
self._copy_storage_dir()

self._manage_imported_files(self.version, build_id=1)
sync_id = self._manage_imported_files(self.version)
self.assertEqual(ImportedFile.objects.count(), 3)
document = (
PageDocument()
.search()
.filter("term", project=self.project.slug)
.filter("term", version=self.version.slug)
.filter("term", path="test.html")
.filter("term", build=1)
.filter("term", build=sync_id)
.execute()[0]
)
self.assertEqual(document.sections[0].content, "Woo")
Expand All @@ -292,15 +288,15 @@ def test_update_content(self):
with override_settings(DOCROOT=self.test_dir):
self._copy_storage_dir()

self._manage_imported_files(self.version, build_id=2)
sync_id = self._manage_imported_files(self.version)
self.assertEqual(ImportedFile.objects.count(), 3)
document = (
PageDocument()
.search()
.filter("term", project=self.project.slug)
.filter("term", version=self.version.slug)
.filter("term", path="test.html")
.filter("term", build=2)
.filter("term", build=sync_id)
.execute()[0]
)
self.assertEqual(document.sections[0].content, "Something Else")
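
The body of `remove_indexed_files` is not part of this diff; only its `build_id` to `sync_id` parameter rename appears above. Assuming it performs an Elasticsearch delete-by-query through the same `PageDocument` used in these tests, a rough sketch could look like the following (the import path, signature, and body are assumptions, not taken from this PR):

# Hypothetical sketch -- the real implementation is not shown in this diff.
from readthedocs.search.documents import PageDocument  # assumed import path

def remove_indexed_files(project_slug, version_slug=None, sync_id=None, index_name=None):
    search = PageDocument().search().filter("term", project=project_slug)
    if version_slug:
        search = search.filter("term", version=version_slug)
    if sync_id is not None:
        # Keep the documents of the current sync, drop everything else.
        search = search.exclude("term", build=sync_id)
    # `index_name` handling is omitted in this sketch.
    search.delete()  # elasticsearch-dsl delete-by-query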