Skip to content

Commit

Permalink
check if source is already transcribed
Browse files Browse the repository at this point in the history
Use btctranscripts.com/status.json to check if a given source is already
transcribed before processing
  • Loading branch information
kouloumos committed Nov 24, 2023
1 parent dd18f6e commit 64dbbe0
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 8 deletions.
37 changes: 29 additions & 8 deletions app/transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from app.transcript import Transcript, Source, Audio, Video, Playlist, RSS
from app import __app_name__, __version__, application
from app.utils import write_to_json
from app.utils import write_to_json, get_existing_media
from app.logging import get_logger


Expand All @@ -35,6 +35,7 @@ def __init__(self, model="tiny", chapters=False, pr=False, summarize=False, deep
self.queue = queue if not test_mode else False
# during testing we need to create the markdown for validation purposes
self.markdown = markdown or test_mode
self.existing_media = None
self.test_mode = test_mode
self.logger = get_logger()
self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp()
Expand Down Expand Up @@ -113,13 +114,19 @@ def check_if_youtube(source: Source):
except Exception as e:
raise Exception(f"Error from assigning source: {e}")

def add_transcription_source(self, source_file, loc="misc", title=None, date=None, tags=[], category=[], speakers=[], preprocess=True, youtube_metadata=None, chapters=None, nocheck=False):
def add_transcription_source(self, source_file, loc="misc", title=None, date=None, tags=[], category=[], speakers=[], preprocess=True, youtube_metadata=None, chapters=None, nocheck=False, excluded_media=[]):
"""Add a source for transcription"""
transcription_sources = {"added": [], "exist": []}
# check if source is a local file
local = False
if os.path.isfile(source_file):
local = True
if not nocheck and not local and self.existing_media is None and not self.test_mode:
self.existing_media = get_existing_media()
# combine existing media from btctranscripts.com with excluded media given from source
excluded_media = {value: True for value in excluded_media}
if self.existing_media is not None:
excluded_media.update(self.existing_media)
# initialize source
source = self._initialize_source(
source=Source(source_file, loc, local, title, date,
Expand All @@ -130,18 +137,32 @@ def add_transcription_source(self, source_file, loc="misc", title=None, date=Non
if source.type == "playlist":
# add a transcript for each source/video in the playlist
for video in source.videos:
transcription_sources['added'].append(video)
self.transcripts.append(Transcript(video, self.test_mode))
if video.media not in excluded_media:
transcription_sources['added'].append(video)
self.transcripts.append(Transcript(video, self.test_mode))
else:
transcription_sources['exist'].append(video)
elif source.type == 'rss':
# add a transcript for each source/audio in the rss feed
for entry in source.entries:
transcription_sources['added'].append(entry)
self.transcripts.append(Transcript(entry, self.test_mode))
if entry.media not in excluded_media:
transcription_sources['added'].append(entry)
self.transcripts.append(Transcript(entry, self.test_mode))
else:
transcription_sources['exist'].append(entry)
elif source.type in ['audio', 'video']:
transcription_sources['added'].append(source)
self.transcripts.append(Transcript(source, self.test_mode))
if source.media not in excluded_media:
transcription_sources['added'].append(source)
self.transcripts.append(Transcript(source, self.test_mode))
self.logger.info(f"Source added for transcription: {source.title}")
else:
transcription_sources['exist'].append(source)
self.logger.info(f"Source already exists: {source.title}")
else:
raise Exception(f"Invalid source: {source_file}")
if source.type in ['playlist', 'rss']:
self.logger.info(
f"{source.title}: sources added for transcription: {len(transcription_sources['added'])} (Ignored: {len(transcription_sources['exist'])} sources)")
return transcription_sources

def push_to_queue(self, transcript: Transcript, payload=None):
Expand Down
51 changes: 51 additions & 0 deletions app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
import re
from datetime import datetime

import requests

from app.logging import get_logger

logger = get_logger()


def slugify(text):
    """Convert ``text`` into a lowercase, hyphen-separated slug.

    Every run of non-word characters (anything not matched by ``\\w``) is
    collapsed into a single hyphen, hyphens are stripped from both ends,
    and the result is lowercased. Underscores count as word characters and
    are therefore preserved.
    """
    hyphenated = re.sub(r'\W+', '-', text)
    trimmed = hyphenated.strip('-')
    return trimmed.lower()

Expand All @@ -17,3 +24,47 @@ def write_to_json(json_data, output_dir, filename, add_timestamp=True):
with open(file_path, "w") as json_file:
json.dump(json_data, json_file, indent=4)
return file_path


def get_status():
    """Load the btctranscripts.com status data, caching it in a local file.

    If ``status.json`` exists in the current working directory it is read
    and returned as-is; this function never refreshes an existing cache
    file. Otherwise the data is downloaded from the remote URL and written
    to ``status.json`` for subsequent calls.

    Returns:
        A ``(data, source)`` tuple on success, where ``data`` is the parsed
        JSON and ``source`` is either the local file path or the remote URL
        it was loaded from. On any failure the error is logged and ``None``
        is returned (NOT a tuple), so callers must check before unpacking.
    """
    STATUS_FILE_PATH = "status.json"  # the file path for storing the status locally
    # Without a timeout, requests waits forever on a stalled connection.
    REQUEST_TIMEOUT_SECONDS = 10
    try:
        source = STATUS_FILE_PATH
        if os.path.exists(STATUS_FILE_PATH):
            # If the file exists locally, load the data from the file
            with open(STATUS_FILE_PATH, "r", encoding="utf-8") as file:
                data = json.load(file)
        else:
            # If the file doesn't exist locally, fetch it from the remote URL
            url = "http://btctranscripts.com/status.json"
            source = url
            response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            if response.status_code != 200:
                raise Exception(f"Status code: {response.status_code}")
            data = response.json()
            # Store the fetched data locally
            with open(STATUS_FILE_PATH, "w", encoding="utf-8") as file:
                json.dump(data, file)

        return data, source
    except Exception as e:
        # Best-effort helper: report the problem and let the caller decide.
        logger.error(f"Error fetching status data: {e}")
        return None


def get_existing_media():
    """Build a lookup dictionary of media already listed in the status data.

    The status data is obtained through ``get_status()`` and every value
    under ``data["existing"]["media"]`` becomes a key mapped to ``True``,
    so membership can be checked with ``media in result``.

    Returns:
        A dict ``{media: True, ...}``. An empty dict is returned when the
        status data is unavailable, empty, or does not have the expected
        ``existing``/``media`` structure (the latter is logged as an error).
    """
    try:
        status = get_status()  # Fetch status data
        if not status:
            # get_status() returns None on failure and has already logged
            # the cause; unpacking None here would only raise a TypeError
            # and produce a second, misleading log entry.
            return {}
        data, source = status
        if not data:
            return {}
        media = data["existing"]["media"]
        logger.info(
            f"Fetched {len(media)} existing media sources from {source}")
        return {value: True for value in media}
    except Exception as e:
        logger.error(f"Error fetching media data: {e}")
        return {}

0 comments on commit 64dbbe0

Please sign in to comment.