diff --git a/Readme.md b/Readme.md index ba40e7c..335aeb8 100644 --- a/Readme.md +++ b/Readme.md @@ -106,8 +106,7 @@ To configure the transcription process, you can use the following flags: - `-D` or `--deepgram`: Use deepgram for transcription, instead of using the whisper model [default: False] - `-M` or `--diarize`: Supply this flag if you have multiple speakers AKA want to diarize the content [only available with deepgram] - `-S` or `--summarize`: Summarize the transcript [only available with deepgram] -- `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript. -- `--github`: Push transcripts to a new branch on the origin bitcointranscripts repo +- `--github`: Specify the GitHub operation mode: `remote`, `local` or `none` [default: none] - `-u` or `--upload`: Upload processed model files to AWS S3 - `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts - `--noqueue`: Do not push the resulting transcript to the Queuer, instead store the payload in a json file diff --git a/app/transcription.py b/app/transcription.py index 94ea1d1..0af3951 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -1,21 +1,20 @@ -import json -import logging import os import shutil import random -import re import subprocess import tempfile -import time -from datetime import datetime from dotenv import dotenv_values -import pytube -from pytube.exceptions import PytubeError -import requests import yt_dlp -from app.transcript import Transcript, Source, Audio, Video, Playlist, RSS +from app.transcript import ( + Transcript, + Source, + Audio, + Video, + Playlist, + RSS +) from app import ( __app_name__, __version__, @@ -25,14 +24,17 @@ ) from app.logging import get_logger from app.queuer import Queuer -from app.types import PostprocessOutput +from app.types import ( + GitHubMode, + PostprocessOutput +) class Transcription: def __init__( self, model="tiny", - github=False, + github: GitHubMode = "none", summarize=False, 
deepgram=False, diarize=False, @@ -77,15 +79,15 @@ def _create_subdirectory(self, subdir_name): os.makedirs(subdir_path) return subdir_path - def __configure_target_repo(self, github): - if not github: + def __configure_target_repo(self, github: GitHubMode): + if github == "none": return None config = dotenv_values(".env") git_repo_dir = config.get("BITCOINTRANSCRIPTS_DIR") if not git_repo_dir: raise Exception( "To push to GitHub you need to define a 'BITCOINTRANSCRIPTS_DIR' in your .env file") - return None + self.github = github return git_repo_dir def __configure_review_flag(self, needs_review): @@ -218,7 +220,8 @@ def add_transcription_source( tags, category, speakers, preprocess, link), youtube_metadata=youtube_metadata, chapters=chapters) - self.logger.info(f"Detected source: {source}") + self.logger.debug(f"Detected source: {source}") + if source.type == "playlist": # add a transcript for each source/video in the playlist for video in source.videos: @@ -306,17 +309,22 @@ def start(self, test_transcript=None): def push_to_github(self, outputs: list[PostprocessOutput]): # Change to the directory where your Git repository is located os.chdir(self.bitcointranscripts_dir) - # Fetch the latest changes from the remote repository - subprocess.run(['git', 'fetch', 'origin', 'master']) - # Create a new branch from the fetched 'origin/master' - branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}" - subprocess.run(['git', 'checkout', '-b', branch_name, 'origin/master']) + if self.github == "remote": + # Fetch the latest changes from the remote repository + subprocess.run(['git', 'fetch', 'origin', 'master']) + # Create a new branch from the fetched 'origin/master' + branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}" + subprocess.run( + ['git', 'checkout', '-b', branch_name, 'origin/master']) + # For each output with markdown, create a new commit in the new branch for output in outputs: if 
output.get('markdown'): markdown_file = output['markdown'] destination_path = os.path.join( self.bitcointranscripts_dir, output["transcript"].source.loc) + # Create the destination directory if it doesn't exist + os.makedirs(destination_path, exist_ok=True) # Ensure the markdown file exists before copying if os.path.exists(markdown_file): shutil.copy(markdown_file, destination_path) @@ -328,11 +336,12 @@ def push_to_github(self, outputs: list[PostprocessOutput]): else: print(f"Markdown file {markdown_file} does not exist.") - # Push the branch to the remote repository - subprocess.run(['git', 'push', 'origin', branch_name]) - # Delete branch locally - subprocess.run(['git', 'checkout', 'master']) - subprocess.run(['git', 'branch', '-D', branch_name]) + if self.github == "remote": + # Push the branch to the remote repository + subprocess.run(['git', 'push', 'origin', branch_name]) + # Delete branch locally + subprocess.run(['git', 'checkout', 'master']) + subprocess.run(['git', 'branch', '-D', branch_name]) def write_to_markdown_file(self, transcript: Transcript, output_dir): """Writes transcript to a markdown file and returns its absolute path diff --git a/app/types.py b/app/types.py index d2eb949..f59e439 100644 --- a/app/types.py +++ b/app/types.py @@ -1,10 +1,12 @@ from typing import ( + Literal, TypedDict, Optional ) from app.transcript import Transcript +GitHubMode = Literal["remote", "local", "none"] class PostprocessOutput(TypedDict): transcript: Transcript diff --git a/transcriber.py b/transcriber.py index 1e4e063..663b92b 100644 --- a/transcriber.py +++ b/transcriber.py @@ -8,13 +8,13 @@ from app import ( __app_name__, __version__, - application, utils ) from app.commands import queue +from app.logging import configure_logger, get_logger from app.transcript import Transcript from app.transcription import Transcription -from app.logging import configure_logger, get_logger +from app.types import GitHubMode logger = get_logger() @@ -91,9 +91,13 @@ def 
print_help(ctx, param, value): ) github = click.option( "--github", - is_flag=True, - default=False, - help="Push transcripts to a new branch on the origin bitcointranscripts repo", + type=click.Choice(["remote", "local", "none"]), + default="none", + help=("Specify the GitHub operation mode. " + "'remote': Create a new branch, push changes to it, and push it to the origin bitcointranscripts repo. " + "'local': Commit changes to the current local branch without pushing to the remote repo. " + "'none': Do not perform any GitHub operations."), + show_default=True ) upload_to_s3 = click.option( "-u", @@ -213,7 +217,7 @@ def transcribe( tags: list, speakers: list, category: list, - github: bool, + github: GitHubMode, deepgram: bool, summarize: bool, diarize: bool, @@ -317,6 +321,7 @@ def preprocess( configure_logger(log_level=logging.INFO) logger.info(f"Preprocessing sources...") transcription = Transcription( + queue=False, batch_preprocessing_output=not no_batched_output) if source.endswith(".json"): transcription.add_transcription_source_JSON(source, nocheck=nocheck) @@ -361,7 +366,7 @@ def preprocess( def postprocess( metadata_json_file, service, - github: bool, + github: GitHubMode, upload: bool, markdown: bool, noqueue: bool, @@ -406,7 +411,8 @@ def postprocess( f"{service}_output"] transcript_to_postprocess.result = transcription.service.finalize_transcript( transcript_to_postprocess) - postprocessed_transcript = transcription.postprocess(transcript_to_postprocess) + postprocessed_transcript = transcription.postprocess( + transcript_to_postprocess) if transcription.bitcointranscripts_dir: transcription.push_to_github([postprocessed_transcript])