Search: stop relying on the DB when indexing #10696

Merged (15 commits) on Sep 14, 2023
2 changes: 1 addition & 1 deletion docs/dev/server-side-search.rst
@@ -62,7 +62,7 @@ You can fix this by deleting the page index and :ref:`re-indexing <server-side-s
.. prompt:: bash

       inv docker.manage 'search_index --delete'
-      inv docker.manage reindex_elasticsearch
+      inv docker.manage 'reindex_elasticsearch --queue web'

How we index documentation
~~~~~~~~~~~~~~~~~~~~~~~~~~
39 changes: 4 additions & 35 deletions readthedocs/builds/admin.py
@@ -12,8 +12,7 @@
)
from readthedocs.core.utils import trigger_build
from readthedocs.core.utils.admin import pretty_json_field
-from readthedocs.projects.models import HTMLFile
-from readthedocs.search.utils import _indexing_helper
+from readthedocs.projects.tasks.search import reindex_version


class BuildCommandResultInline(admin.TabularInline):
@@ -89,7 +88,7 @@ class VersionAdmin(admin.ModelAdmin):
    list_filter = ("type", "privacy_level", "active", "built")
    search_fields = ("slug", "project__slug")
    raw_id_fields = ("project",)
-    actions = ["build_version", "reindex_version", "wipe_version_indexes"]
+    actions = ["build_version", "reindex_version"]

    def project_slug(self, obj):
        return obj.project.slug
@@ -117,41 +116,11 @@ def build_version(self, request, queryset):
    @admin.action(description="Reindex version to ES")
    def reindex_version(self, request, queryset):
        """Reindexes all selected versions to ES."""
-        html_objs_qs = []
-        for version in queryset.iterator():
-            html_objs = HTMLFile.objects.filter(
-                project=version.project, version=version
-            )
-
-            if html_objs.exists():
-                html_objs_qs.append(html_objs)
-
-        if html_objs_qs:
-            _indexing_helper(html_objs_qs, wipe=False)
+        for version_id in queryset.values_list("id", flat=True).iterator():
+            reindex_version.delay(version_id)

        self.message_user(request, "Task initiated successfully.", messages.SUCCESS)

-    @admin.action(description="Wipe version from ES")
-    def wipe_version_indexes(self, request, queryset):
-        """Wipe selected versions from ES."""
-        html_objs_qs = []
-        for version in queryset.iterator():
-            html_objs = HTMLFile.objects.filter(
-                project=version.project, version=version
-            )
-
-            if html_objs.exists():
-                html_objs_qs.append(html_objs)
-
-        if html_objs_qs:
-            _indexing_helper(html_objs_qs, wipe=True)
-
-        self.message_user(
-            request,
-            "Task initiated successfully",
-            messages.SUCCESS,
-        )


@admin.register(RegexAutomationRule)
class RegexAutomationRuleAdmin(PolymorphicChildModelAdmin, admin.ModelAdmin):
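The admin action above now only enqueues one Celery task per version id. The
task itself lives in readthedocs/projects/tasks/search.py and is not part of
this hunk; the following is a minimal sketch of the shape such a task could
take. The decorator, queue name, and body are assumptions; only the task name
and its single version-id argument come from the call sites in this PR.

    from readthedocs.builds.models import Version
    from readthedocs.worker import app


    @app.task(queue="web")
    def reindex_version(version_id):
        """Reindex a single version into Elasticsearch."""
        # The version may have been deleted between enqueue and execution.
        version = Version.objects.filter(pk=version_id).first()
        if version is None:
            return
        # ... collect the version's indexable files and push them to ES ...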
2 changes: 1 addition & 1 deletion readthedocs/builds/models.py
@@ -320,7 +320,7 @@ def config(self):
        :rtype: dict
        """
        last_build = (
-            self.builds(manager=INTERNAL).filter(
+            self.builds.filter(
Member Author commented:

    We don't need to filter builds by internal/external here; if we are
    accessing the version, we already know whether it's external or internal.

                state=BUILD_STATE_FINISHED,
                success=True,
            ).order_by('-date')
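To make the removed filter concrete: once you hold a Version, its builds
relation is already scoped to that version, and the version itself is either
internal or external, so re-filtering its builds by type is redundant. An
illustrative snippet, not from the PR (`project` is assumed to be bound):

    from readthedocs.builds.constants import BUILD_STATE_FINISHED
    from readthedocs.builds.models import Version

    version = Version.internal.get(project=project, slug="latest")
    # version.builds already contains only this version's builds, and the
    # version came from the internal manager, so its builds are internal
    # by construction:
    last_build = (
        version.builds.filter(state=BUILD_STATE_FINISHED, success=True)
        .order_by("-date")
        .first()
    )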
22 changes: 22 additions & 0 deletions readthedocs/builds/querysets.py
@@ -118,6 +118,28 @@ def public(
    def api(self, user=None):
        return self.public(user, only_active=False)

+    def for_reindex(self):
+        """
+        Get all versions that can be reindexed.
+
+        A version can be reindexed if:
+
+        - It's active and has been built at least once successfully,
+          which means it has files to be indexed.
+        - Its project is not delisted or marked as spam.
+        """
+        return (
+            self.filter(
+                active=True,
+                built=True,
+                builds__state=BUILD_STATE_FINISHED,
+                builds__success=True,
+            )
+            .exclude(project__delisted=True)
+            .exclude(project__is_spam=True)
Member commented:

    We should probably use a score here. Otherwise, only projects manually
    marked as spam will be excluded.

Member Author replied:

    This is a copy of:

        def get_queryset(self):
            """Don't include ignored files and delisted projects."""
            queryset = super().get_queryset()
            queryset = (
                queryset
                .exclude(ignore=True)
                .exclude(project__delisted=True)
                .exclude(project__is_spam=True)
                .select_related('version', 'project')
            )
            return queryset

    Probably better to discuss this at #9899.
+            .distinct()
+        )


class VersionQuerySet(SettingsOverrideObject):
    _default_class = VersionQuerySetBase
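For context, a bulk reindex driven by the new queryset method could look like
the snippet below. This is illustrative glue, assuming the default Version
manager exposes the queryset's for_reindex() method:

    from readthedocs.builds.models import Version
    from readthedocs.projects.tasks.search import reindex_version

    # Stream bare pks instead of model instances; each task gets one id.
    version_ids = Version.objects.for_reindex().values_list("pk", flat=True)
    for version_id in version_ids.iterator():
        reindex_version.delay(version_id)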
57 changes: 8 additions & 49 deletions readthedocs/projects/admin.py
@@ -11,8 +11,8 @@
from readthedocs.core.history import ExtraSimpleHistoryAdmin, set_change_reason
from readthedocs.core.utils import trigger_build
from readthedocs.notifications.views import SendNotificationView
+from readthedocs.projects.tasks.search import reindex_version
from readthedocs.redirects.models import Redirect
-from readthedocs.search.utils import _indexing_helper

from .forms import FeatureForm
from .models import (
@@ -264,7 +264,6 @@ class ProjectAdmin(ExtraSimpleHistoryAdmin):
"run_spam_rule_checks",
"build_default_version",
"reindex_active_versions",
"wipe_all_versions",
"import_tags_from_vcs",
]

@@ -362,66 +361,26 @@ def reindex_active_versions(self, request, queryset):
"""Reindex all active versions of the selected projects to ES."""
qs_iterator = queryset.iterator()
for project in qs_iterator:
version_qs = Version.internal.filter(project=project)
active_versions = version_qs.filter(active=True)
versions_id_to_reindex = project.versions.for_reindex().values_list(
"pk", flat=True
)

if not active_versions.exists():
if not versions_id_to_reindex.exists():
self.message_user(
request,
"No active versions of project {}".format(project),
"No versions to be re-indexed for project {}".format(project),
messages.ERROR,
)
else:
html_objs_qs = []
for version in active_versions.iterator():
html_objs = HTMLFile.objects.filter(
project=project, version=version
)

if html_objs.exists():
html_objs_qs.append(html_objs)

if html_objs_qs:
_indexing_helper(html_objs_qs, wipe=False)
for version_id in versions_id_to_reindex.iterator():
reindex_version.delay(version_id)

self.message_user(
request,
"Task initiated successfully for {}".format(project),
messages.SUCCESS,
)

-    # TODO: rename method to mention "indexes" on its name
-    @admin.action(description="Wipe all versions from ES")
-    def wipe_all_versions(self, request, queryset):
-        """Wipe indexes of all versions of selected projects."""
-        qs_iterator = queryset.iterator()
-        for project in qs_iterator:
-            version_qs = Version.internal.filter(project=project)
-            if not version_qs.exists():
-                self.message_user(
-                    request,
-                    "No active versions of project {}.".format(project),
-                    messages.ERROR,
-                )
-            else:
-                html_objs_qs = []
-                for version in version_qs.iterator():
-                    html_objs = HTMLFile.objects.filter(
-                        project=project, version=version
-                    )
-
-                    if html_objs.exists():
-                        html_objs_qs.append(html_objs)
-
-                if html_objs_qs:
-                    _indexing_helper(html_objs_qs, wipe=True)
-
-                self.message_user(
-                    request,
-                    "Task initiated successfully for {}.".format(project),
-                    messages.SUCCESS,
-                )

    @admin.action(description="Import tags from the version control API")
    def import_tags_from_vcs(self, request, queryset):
        for project in queryset.iterator():
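A note on the dispatch pattern in reindex_active_versions above:
values_list("pk", flat=True) combined with .iterator() streams bare primary
keys from the database rather than instantiating Version objects, so only an
integer ever crosses the Celery broker. Isolated for clarity (`project` is
assumed to be bound):

    versions_id_to_reindex = project.versions.for_reindex().values_list(
        "pk", flat=True
    )
    for version_id in versions_id_to_reindex.iterator():
        # Only the integer pk is serialized into the task payload.
        reindex_version.delay(version_id)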
10 changes: 2 additions & 8 deletions readthedocs/projects/tasks/builds.py
@@ -67,7 +67,7 @@
from ..models import APIProject, WebHookEvent
from ..signals import before_vcs
from .mixins import SyncRepositoryMixin
-from .search import fileify
+from .search import index_build
from .utils import (
    BuildRequest,
    clean_build,
@@ -653,13 +653,7 @@ def on_success(self, retval, task_id, args, kwargs):
        )

        # Index search data
-        fileify.delay(
-            version_pk=self.data.version.pk,
-            commit=self.data.build['commit'],
-            build=self.data.build['id'],
-            search_ranking=self.data.config.search.ranking,
-            search_ignore=self.data.config.search.ignore,
-        )
+        index_build.delay(build_id=self.data.build["id"])

        if not self.data.project.has_valid_clone:
            self.set_valid_clone()
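The call site above captures the design change in this PR: the build pipeline
no longer forwards the version pk, commit, and search settings to the task;
index_build receives only the build id and looks the rest up itself. A sketch
of that shape (the real task lives in readthedocs/projects/tasks/search.py;
the decorator and body here are assumptions):

    from readthedocs.builds.models import Build
    from readthedocs.worker import app


    @app.task(queue="web")
    def index_build(build_id):
        """Create the search index for the version built by build_id."""
        build = Build.objects.filter(pk=build_id).select_related("version").first()
        if build is None or build.version is None:
            return
        # ... derive the commit and search ranking/ignore settings from the
        # build and its config, then index the version's files into ES ...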