Skip to content

Commit

Permalink
modify json
Browse files Browse the repository at this point in the history
  • Loading branch information
duyguHsnHsn committed Feb 5, 2024
1 parent a654a05 commit 74f2e62
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 29 deletions.
22 changes: 7 additions & 15 deletions examples/confluence-reader/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,28 +25,20 @@ It follows this structured format:
"metadata": {
"title": "Page Title",
"id": "Page ID",
"source": "Source URL"
"source": "Source URL",
"deleted": false
},
"page_content": "Page Content Text",
"deleted": false
"data": "Page Content Text"
},
{
"metadata": {
"title": "Another Page Title",
"id": "Another Page ID",
"source": "Another Source URL"
"source": "Another Source URL",
"deleted": true
},
"page_content": "Another Page Content Text",
"deleted": false
},
{
"metadata": {
"title": "Yet Another Page Title",
"id": "Yet Another Page ID",
"source": "Yet Another Source URL"
},
"page_content": "Yet Another Page Content Text",
"deleted": false
"data": "Another Page Content Text"
}
]

```
1 change: 1 addition & 0 deletions examples/confluence-reader/confluence_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[]
19 changes: 8 additions & 11 deletions examples/confluence-reader/confluence_document.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,36 @@

class ConfluenceDocument:
def __init__(self, metadata, page_content, deleted=False):
def __init__(self, metadata, data, deleted=False):
"""
Initializes a ConfluenceDocument instance.
:param metadata: A dictionary containing metadata about the Confluence document.
Expected to contain 'title', 'id', and 'source'.
:param page_content: A string representing the content of the Confluence page.
:param deleted: A boolean indicating whether the document is considered deleted.
A 'deleted' key is added to this dictionary to record whether the document is considered deleted.
:param data: A string representing the content of the Confluence page.
"""
self.validate_metadata(metadata)
metadata['deleted'] = deleted
self.metadata = metadata
self.page_content = page_content
self.deleted = deleted
self.data = data

def serialize(self):
"""
Serializes the ConfluenceDocument instance into a dictionary.
"""
return {
'metadata': self.metadata,
'page_content': self.page_content,
'deleted': self.deleted
'data': self.data
}

@staticmethod
def validate_metadata(metadata):
"""
Validates the metadata dictionary to ensure it contains required keys.
Validates the metadata dictionary to ensure it contains the required keys; the optional 'deleted' key is not checked.
:param metadata: A dictionary containing metadata about the Confluence document.
:raises ValueError: If metadata does not contain the required keys.
:raises ValueError: If metadata does not contain the required keys ('title', 'id', 'source').
"""
required_keys = {'title', 'id', 'source'}
if not required_keys.issubset(metadata):
missing_keys = required_keys - metadata.keys()
raise ValueError(f"Metadata is missing required keys: {missing_keys}")

6 changes: 3 additions & 3 deletions examples/confluence-reader/fetch_confluence_space.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def update_saved_documents(file_path, new_docs):
existing_docs = read_json_file(file_path) or []

if isinstance(existing_docs, list) and existing_docs and isinstance(existing_docs[0], dict):
existing_docs = [ConfluenceDocument(doc['metadata'], doc['page_content'], doc.get('deleted', False)) for doc in existing_docs]
existing_docs = [ConfluenceDocument(doc['metadata'], doc['data'], doc['metadata'].get('deleted', False)) for doc in existing_docs]

existing_docs_dict = {doc.metadata['id']: doc for doc in existing_docs}

Expand All @@ -54,13 +54,13 @@ def flag_deleted_pages(file_path, current_confluence_documents):
log.error("Existing documents not found. Exiting.")
return

existing_docs = [ConfluenceDocument(doc['metadata'], doc['page_content'], doc.get('deleted', False)) for doc in existing_docs]
existing_docs = [ConfluenceDocument(doc['metadata'], doc['data'], doc['metadata'].get('deleted', False)) for doc in existing_docs]

current_page_ids = {doc.metadata['id'] for doc in current_confluence_documents}

for doc in existing_docs:
if doc.metadata['id'] not in current_page_ids:
doc.deleted = True
doc.metadata['deleted'] = True

serialized_docs = [doc.serialize() for doc in existing_docs]
write_json_file(file_path, serialized_docs)
Expand Down

0 comments on commit 74f2e62

Please sign in to comment.