From babeba248d301260e06577f95c0084eff4d70057 Mon Sep 17 00:00:00 2001 From: Tim Fischer Date: Tue, 11 Feb 2025 11:13:38 +0000 Subject: [PATCH 1/3] added explicit join conditions (on clauses) --- .../timeline_analysis_columns.py | 43 ++++++++++-------- .../word_frequency_columns.py | 44 ++++++++++-------- .../bbox_anno_search_columns.py | 11 ++++- .../search/sdoc_search/sdoc_search_columns.py | 45 +++++++++++-------- .../sent_anno_search_columns.py | 11 ++++- .../span_anno_search_columns.py | 11 ++++- 6 files changed, 102 insertions(+), 63 deletions(-) diff --git a/backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py b/backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py index 049ea8b33..42b8b10f8 100644 --- a/backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py +++ b/backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py @@ -2,12 +2,10 @@ from sqlalchemy.dialects.postgresql import ARRAY, array, array_agg from app.core.data.orm.annotation_document import AnnotationDocumentORM -from app.core.data.orm.code import CodeORM from app.core.data.orm.document_tag import DocumentTagORM from app.core.data.orm.source_document import SourceDocumentORM from app.core.data.orm.span_annotation import SpanAnnotationORM from app.core.data.orm.span_text import SpanTextORM -from app.core.data.orm.user import UserORM from app.core.db.sql_utils import aggregate_ids from app.core.search.column_info import AbstractColumns from app.core.search.filtering_operators import FilterOperator, FilterValueType @@ -101,29 +99,31 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): case TimelineAnalysisColumns.CODE_ID_LIST: query_builder._add_subquery_column( aggregate_ids( - CodeORM.id, label=TimelineAnalysisColumns.CODE_ID_LIST.value + SpanAnnotationORM.code_id, + label=TimelineAnalysisColumns.CODE_ID_LIST.value, ) ) query_builder._join_subquery( - SourceDocumentORM.annotation_documents, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.code, + SpanAnnotationORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) + case TimelineAnalysisColumns.USER_ID_LIST: query_builder._add_subquery_column( aggregate_ids( - UserORM.id, TimelineAnalysisColumns.USER_ID_LIST.value + AnnotationDocumentORM.user_id, + TimelineAnalysisColumns.USER_ID_LIST.value, ) ) query_builder._join_subquery( - SourceDocumentORM.annotation_documents, - isouter=True, - ) - query_builder._join_subquery( - AnnotationDocumentORM.user, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) case TimelineAnalysisColumns.SPAN_ANNOTATIONS: @@ -131,26 +131,31 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): cast( array_agg( func.distinct( - array([cast(CodeORM.id, String), SpanTextORM.text]) + array( + [ + cast(SpanAnnotationORM.code_id, String), + SpanTextORM.text, + ] + ) ), ), ARRAY(String, dimensions=2), ).label(TimelineAnalysisColumns.SPAN_ANNOTATIONS.value) ) query_builder._join_subquery( - SourceDocumentORM.annotation_documents, - isouter=True, - ) - query_builder._join_subquery( - AnnotationDocumentORM.span_annotations, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.span_text, + SpanAnnotationORM, + SpanAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.code, + SpanTextORM, + SpanTextORM.id == SpanAnnotationORM.span_text_id, isouter=True, ) diff --git a/backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py b/backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py index ed29065fe..298838022 100644 --- a/backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py +++ b/backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py @@ -2,12 +2,10 @@ from sqlalchemy.dialects.postgresql import ARRAY, array, array_agg from app.core.data.orm.annotation_document import AnnotationDocumentORM -from app.core.data.orm.code import CodeORM from app.core.data.orm.document_tag import DocumentTagORM from app.core.data.orm.source_document import SourceDocumentORM from app.core.data.orm.span_annotation import SpanAnnotationORM from app.core.data.orm.span_text import SpanTextORM -from app.core.data.orm.user import UserORM from app.core.data.orm.word_frequency import WordFrequencyORM from app.core.db.sql_utils import aggregate_ids from app.core.search.column_info import AbstractColumns @@ -157,27 +155,30 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): case WordFrequencyColumns.CODE_ID_LIST: query_builder._add_subquery_column( aggregate_ids( - CodeORM.id, label=WordFrequencyColumns.CODE_ID_LIST.value + SpanAnnotationORM.code_id, + label=WordFrequencyColumns.CODE_ID_LIST.value, ) ) query_builder._join_subquery( - SourceDocumentORM.annotation_documents, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.code, + SpanAnnotationORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) case WordFrequencyColumns.USER_ID_LIST: query_builder._add_subquery_column( - aggregate_ids(UserORM.id, WordFrequencyColumns.USER_ID_LIST.value) - ) - query_builder._join_subquery( - SourceDocumentORM.annotation_documents, - isouter=True, + aggregate_ids( + AnnotationDocumentORM.user_id, + WordFrequencyColumns.USER_ID_LIST.value, + ) ) query_builder._join_subquery( - AnnotationDocumentORM.user, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) case WordFrequencyColumns.SPAN_ANNOTATIONS: @@ -185,26 +186,31 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): cast( array_agg( func.distinct( - array([cast(CodeORM.id, String), SpanTextORM.text]) + array( + [ + cast(SpanAnnotationORM.code_id, String), + SpanTextORM.text, + ] + ) ), ), ARRAY(String, dimensions=2), ).label(WordFrequencyColumns.SPAN_ANNOTATIONS.value) ) query_builder._join_subquery( - SourceDocumentORM.annotation_documents, - isouter=True, - ) - query_builder._join_subquery( - AnnotationDocumentORM.span_annotations, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.span_text, + SpanAnnotationORM, + SpanAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.code, + SpanTextORM, + SpanTextORM.id == SpanAnnotationORM.span_text_id, isouter=True, ) diff --git a/backend/src/app/core/search/bbox_anno_search/bbox_anno_search_columns.py b/backend/src/app/core/search/bbox_anno_search/bbox_anno_search_columns.py index 70e9854e5..81328c0b5 100644 --- a/backend/src/app/core/search/bbox_anno_search/bbox_anno_search_columns.py +++ b/backend/src/app/core/search/bbox_anno_search/bbox_anno_search_columns.py @@ -81,8 +81,15 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): label=BBoxColumns.DOCUMENT_TAG_ID_LIST.value, ) ) - query_builder._join_subquery(BBoxAnnotationORM.annotation_document) - query_builder._join_subquery(AnnotationDocumentORM.source_document) + query_builder._join_subquery( + AnnotationDocumentORM, + AnnotationDocumentORM.id + == BBoxAnnotationORM.annotation_document_id, + ) + query_builder._join_subquery( + SourceDocumentORM, + SourceDocumentORM.id == AnnotationDocumentORM.source_document_id, + ) query_builder._join_subquery( SourceDocumentORM.document_tags, isouter=True ) diff --git a/backend/src/app/core/search/sdoc_search/sdoc_search_columns.py b/backend/src/app/core/search/sdoc_search/sdoc_search_columns.py index 24933f61a..e54aa3e9e 100644 --- a/backend/src/app/core/search/sdoc_search/sdoc_search_columns.py +++ b/backend/src/app/core/search/sdoc_search/sdoc_search_columns.py @@ -2,12 +2,10 @@ from sqlalchemy.dialects.postgresql import ARRAY, array, array_agg from app.core.data.orm.annotation_document import AnnotationDocumentORM -from app.core.data.orm.code import CodeORM from app.core.data.orm.document_tag import DocumentTagORM from app.core.data.orm.source_document import SourceDocumentORM from app.core.data.orm.span_annotation import SpanAnnotationORM from app.core.data.orm.span_text import SpanTextORM -from app.core.data.orm.user import UserORM from app.core.db.sql_utils import aggregate_ids from app.core.search.column_info import AbstractColumns from app.core.search.filtering_operators import FilterOperator, FilterValueType @@ -111,26 +109,30 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): ) case SdocColumns.CODE_ID_LIST: query_builder._add_subquery_column( - aggregate_ids(CodeORM.id, label=SdocColumns.CODE_ID_LIST.value) + aggregate_ids( + SpanAnnotationORM.code_id, label=SdocColumns.CODE_ID_LIST.value + ) ) query_builder._join_subquery( - SourceDocumentORM.annotation_documents, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.code, + SpanAnnotationORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) + case SdocColumns.USER_ID_LIST: query_builder._add_subquery_column( - aggregate_ids(UserORM.id, SdocColumns.USER_ID_LIST.value) - ) - query_builder._join_subquery( - SourceDocumentORM.annotation_documents, - isouter=True, + aggregate_ids( + AnnotationDocumentORM.user_id, SdocColumns.USER_ID_LIST.value + ) ) query_builder._join_subquery( - AnnotationDocumentORM.user, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) case SdocColumns.SPAN_ANNOTATIONS: @@ -138,26 +140,31 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): cast( array_agg( func.distinct( - array([cast(CodeORM.id, String), SpanTextORM.text]) + array( + [ + cast(SpanAnnotationORM.code_id, String), + SpanTextORM.text, + ] + ) ), ), ARRAY(String, dimensions=2), ).label(SdocColumns.SPAN_ANNOTATIONS.value) ) query_builder._join_subquery( - SourceDocumentORM.annotation_documents, - isouter=True, - ) - query_builder._join_subquery( - AnnotationDocumentORM.span_annotations, + AnnotationDocumentORM, + AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.span_text, + SpanAnnotationORM, + SpanAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, isouter=True, ) query_builder._join_subquery( - SpanAnnotationORM.code, + SpanTextORM, + SpanTextORM.id == SpanAnnotationORM.span_text_id, isouter=True, ) diff --git a/backend/src/app/core/search/sent_anno_search/sent_anno_search_columns.py b/backend/src/app/core/search/sent_anno_search/sent_anno_search_columns.py index a0d65c4d7..45639802b 100644 --- a/backend/src/app/core/search/sent_anno_search/sent_anno_search_columns.py +++ b/backend/src/app/core/search/sent_anno_search/sent_anno_search_columns.py @@ -106,8 +106,15 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): label=SentAnnoColumns.DOCUMENT_TAG_ID_LIST.value, ) ) - query_builder._join_subquery(SentenceAnnotationORM.annotation_document) - query_builder._join_subquery(AnnotationDocumentORM.source_document) + query_builder._join_subquery( + AnnotationDocumentORM, + AnnotationDocumentORM.id + == SentenceAnnotationORM.annotation_document_id, + ) + query_builder._join_subquery( + SourceDocumentORM, + SourceDocumentORM.id == AnnotationDocumentORM.source_document_id, + ) query_builder._join_subquery( SourceDocumentORM.document_tags, isouter=True ) diff --git a/backend/src/app/core/search/span_anno_search/span_anno_search_columns.py b/backend/src/app/core/search/span_anno_search/span_anno_search_columns.py index b5a750080..365618bc8 100644 --- a/backend/src/app/core/search/span_anno_search/span_anno_search_columns.py +++ b/backend/src/app/core/search/span_anno_search/span_anno_search_columns.py @@ -105,8 +105,15 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): label=SpanColumns.DOCUMENT_TAG_ID_LIST.value, ) ) - query_builder._join_subquery(SpanAnnotationORM.annotation_document) - query_builder._join_subquery(AnnotationDocumentORM.source_document) + query_builder._join_subquery( + AnnotationDocumentORM, + AnnotationDocumentORM.id + == SpanAnnotationORM.annotation_document_id, + ) + query_builder._join_subquery( + SourceDocumentORM, + SourceDocumentORM.id == AnnotationDocumentORM.source_document_id, + ) query_builder._join_subquery( SourceDocumentORM.document_tags, isouter=True ) From f523396b1e081c52363e9e69537381dec74bc8d6 Mon Sep 17 00:00:00 2001 From: Tim Fischer Date: Tue, 11 Feb 2025 12:57:32 +0000 Subject: [PATCH 2/3] code search now includes SentenceAnnotations --- .../timeline_analysis_columns.py | 15 ++++++++++--- .../word_frequency_columns.py | 16 +++++++++++--- backend/src/app/core/db/sql_utils.py | 13 ++++++++++++ .../search/sdoc_search/sdoc_search_columns.py | 21 +++++++++++++++---- 4 files changed, 55 insertions(+), 10 deletions(-) diff --git a/backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py b/backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py index 42b8b10f8..ddbf8edb0 100644 --- a/backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py +++ b/backend/src/app/core/analysis/timeline_analysis/timeline_analysis_columns.py @@ -3,10 +3,11 @@ from app.core.data.orm.annotation_document import AnnotationDocumentORM from app.core.data.orm.document_tag import DocumentTagORM +from app.core.data.orm.sentence_annotation import SentenceAnnotationORM from app.core.data.orm.source_document import SourceDocumentORM from app.core.data.orm.span_annotation import SpanAnnotationORM from app.core.data.orm.span_text import SpanTextORM -from app.core.db.sql_utils import aggregate_ids +from app.core.db.sql_utils import aggregate_ids, aggregate_two_ids from app.core.search.column_info import AbstractColumns from app.core.search.filtering_operators import FilterOperator, FilterValueType from app.core.search.search_builder import SearchBuilder @@ -98,8 +99,9 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): ) case TimelineAnalysisColumns.CODE_ID_LIST: query_builder._add_subquery_column( - aggregate_ids( + aggregate_two_ids( SpanAnnotationORM.code_id, + SentenceAnnotationORM.code_id, label=TimelineAnalysisColumns.CODE_ID_LIST.value, ) ) @@ -110,7 +112,14 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): ) query_builder._join_subquery( SpanAnnotationORM, - AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, + SpanAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, + isouter=True, + ) + query_builder._join_subquery( + SentenceAnnotationORM, + SentenceAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, isouter=True, ) diff --git a/backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py b/backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py index 298838022..f83cb5b0b 100644 --- a/backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py +++ b/backend/src/app/core/analysis/word_frequency_analysis/word_frequency_columns.py @@ -3,11 +3,12 @@ from app.core.data.orm.annotation_document import AnnotationDocumentORM from app.core.data.orm.document_tag import DocumentTagORM +from app.core.data.orm.sentence_annotation import SentenceAnnotationORM from app.core.data.orm.source_document import SourceDocumentORM from app.core.data.orm.span_annotation import SpanAnnotationORM from app.core.data.orm.span_text import SpanTextORM from app.core.data.orm.word_frequency import WordFrequencyORM -from app.core.db.sql_utils import aggregate_ids +from app.core.db.sql_utils import aggregate_ids, aggregate_two_ids from app.core.search.column_info import AbstractColumns from app.core.search.filtering_operators import FilterOperator, FilterValueType from app.core.search.search_builder import SearchBuilder @@ -154,8 +155,9 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): ) case WordFrequencyColumns.CODE_ID_LIST: query_builder._add_subquery_column( - aggregate_ids( + aggregate_two_ids( SpanAnnotationORM.code_id, + SentenceAnnotationORM.code_id, label=WordFrequencyColumns.CODE_ID_LIST.value, ) ) @@ -166,9 +168,17 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): ) query_builder._join_subquery( SpanAnnotationORM, - AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, + SpanAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, isouter=True, ) + query_builder._join_subquery( + SentenceAnnotationORM, + SentenceAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, + isouter=True, + ) + case WordFrequencyColumns.USER_ID_LIST: query_builder._add_subquery_column( aggregate_ids( diff --git a/backend/src/app/core/db/sql_utils.py b/backend/src/app/core/db/sql_utils.py index 4938b8663..29b97f59b 100644 --- a/backend/src/app/core/db/sql_utils.py +++ b/backend/src/app/core/db/sql_utils.py @@ -9,3 +9,16 @@ def aggregate_ids(column: InstrumentedAttribute, label: str): None, type_=ARRAY(Integer), ).label(label) + + +def aggregate_two_ids( + column1: InstrumentedAttribute, column2: InstrumentedAttribute, label: str +): + return func.array_remove( + func.array_cat( + array_agg(func.distinct(column1), type_=ARRAY(Integer)), + array_agg(func.distinct(column2), type_=ARRAY(Integer)), + ), + None, + type_=ARRAY(Integer), + ).label(label) diff --git a/backend/src/app/core/search/sdoc_search/sdoc_search_columns.py b/backend/src/app/core/search/sdoc_search/sdoc_search_columns.py index e54aa3e9e..8a8bbf0f0 100644 --- a/backend/src/app/core/search/sdoc_search/sdoc_search_columns.py +++ b/backend/src/app/core/search/sdoc_search/sdoc_search_columns.py @@ -3,10 +3,14 @@ from app.core.data.orm.annotation_document import AnnotationDocumentORM from app.core.data.orm.document_tag import DocumentTagORM +from app.core.data.orm.sentence_annotation import SentenceAnnotationORM from app.core.data.orm.source_document import SourceDocumentORM from app.core.data.orm.span_annotation import SpanAnnotationORM from app.core.data.orm.span_text import SpanTextORM -from app.core.db.sql_utils import aggregate_ids +from app.core.db.sql_utils import ( + aggregate_ids, + aggregate_two_ids, +) from app.core.search.column_info import AbstractColumns from app.core.search.filtering_operators import FilterOperator, FilterValueType from app.core.search.search_builder import SearchBuilder @@ -109,8 +113,10 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): ) case SdocColumns.CODE_ID_LIST: query_builder._add_subquery_column( - aggregate_ids( - SpanAnnotationORM.code_id, label=SdocColumns.CODE_ID_LIST.value + aggregate_two_ids( + SpanAnnotationORM.code_id, + SentenceAnnotationORM.code_id, + label=SdocColumns.CODE_ID_LIST.value, ) ) query_builder._join_subquery( @@ -120,7 +126,14 @@ def add_subquery_filter_statements(self, query_builder: SearchBuilder): ) query_builder._join_subquery( SpanAnnotationORM, - AnnotationDocumentORM.source_document_id == SourceDocumentORM.id, + SpanAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, + isouter=True, + ) + query_builder._join_subquery( + SentenceAnnotationORM, + SentenceAnnotationORM.annotation_document_id + == AnnotationDocumentORM.id, isouter=True, ) From 3c414d2c0daf73440f307c411ac09ea2f00cb67c Mon Sep 17 00:00:00 2001 From: Tim Fischer Date: Tue, 11 Feb 2025 13:06:17 +0000 Subject: [PATCH 3/3] improved search statistic code query --- .../core/analysis/search_statistics/search_statistics.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/backend/src/app/core/analysis/search_statistics/search_statistics.py b/backend/src/app/core/analysis/search_statistics/search_statistics.py index 8d38e83e0..6e0b97808 100644 --- a/backend/src/app/core/analysis/search_statistics/search_statistics.py +++ b/backend/src/app/core/analysis/search_statistics/search_statistics.py @@ -6,7 +6,6 @@ from app.core.data.crud.project_metadata import crud_project_meta from app.core.data.dto.search_stats import KeywordStat, SpanEntityStat, TagStat from app.core.data.orm.annotation_document import AnnotationDocumentORM -from app.core.data.orm.code import CodeORM from app.core.data.orm.document_tag import ( DocumentTagORM, SourceDocumentDocumentTagLinkTable, @@ -141,10 +140,9 @@ def compute_code_statistics( ) .join(SpanTextORM.span_annotations) .join(SpanAnnotationORM.annotation_document) - .join(SpanAnnotationORM.code) .group_by(SpanTextORM.id) .filter( - CodeORM.id == code_id, + SpanAnnotationORM.code_id == code_id, AnnotationDocumentORM.source_document_id.in_(list(sdoc_ids)), ) .order_by(count.desc()) @@ -169,7 +167,7 @@ def compute_code_statistics( .join(SpanAnnotationORM.code) .group_by(SpanTextORM.id) .filter( - CodeORM.id == code_id, + SpanAnnotationORM.code_id == code_id, SpanTextORM.id.in_(span_text_ids), ) .order_by(func.array_position(span_text_ids, SpanTextORM.id))