# SceneAnalyzer.py
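#
# Iteratively builds a chapter/scene breakdown of a video: the video is walked in
# fixed-length intervals, a handful of frames and the interval's audio are extracted,
# the audio is transcribed with Whisper (Azure or OpenAI), and the frames, transcript,
# and running chapter summary are sent to GPT-4 with vision, which returns an updated
# chapter breakdown as JSON. The final breakdown is written to chapterBreakdown.json.
#
# Required environment variables (loaded from .env below): AZURE_SPEECH_KEY,
# AZURE_WHISPER_KEY, AZURE_WHISPER_DEPLOYMENT, AZURE_WHISPER_ENDPOINT,
# AZURE_VISION_KEY, AUDIO_API_TYPE, VISION_API_TYPE, OPENAI_API_KEY,
# VISION_DEPLOYMENT_NAME, VISION_ENDPOINT.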
import base64
import json
import logging
import os
import sys
import threading
import time
from functools import wraps

import cv2
import requests
from dotenv import load_dotenv
from moviepy.editor import VideoFileClip

# Silence moviepy's progress logging.
logging.getLogger('moviepy').setLevel(logging.ERROR)

# Load configuration from a local .env file.
load_dotenv()
# Azure Speech key (not referenced elsewhere in this script)
speech_key = os.environ["AZURE_SPEECH_KEY"]
# Azure Whisper API key *
AZ_WHISPER = os.environ["AZURE_WHISPER_KEY"]
# Azure Whisper deployment name *
azure_whisper_deployment = os.environ["AZURE_WHISPER_DEPLOYMENT"]
# Azure Whisper endpoint (resource name only) *
azure_whisper_endpoint = os.environ["AZURE_WHISPER_ENDPOINT"]
# Azure OpenAI vision API key *
azure_vision_key = os.environ["AZURE_VISION_KEY"]
# Audio API type ("OpenAI" or "Azure") *
audio_api_type = os.environ["AUDIO_API_TYPE"]
# GPT-4 Vision API type ("OpenAI" or "Azure") *
vision_api_type = os.environ["VISION_API_TYPE"]
# OpenAI API key *
openai_api_key = os.environ["OPENAI_API_KEY"]
# GPT-4 Azure vision deployment name *
vision_deployment = os.environ["VISION_DEPLOYMENT_NAME"]
# GPT-4 Azure vision endpoint (resource name only) *
vision_endpoint = os.environ["VISION_ENDPOINT"]
def log_execution_time(func):
@wraps(func) # Preserves the name and docstring of the decorated function
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds to complete.")
return result
return wrapper
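# Simple console spinner run on a background thread, used to show progress while the
# long-running capture, transcription, and analysis steps are in flight.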
class Spinner:
def __init__(self, message="Processing..."):
self.spinner_symbols = "|/-\\"
self.idx = 0
self.message = message
self.stop_spinner = False
def spinner_task(self):
while not self.stop_spinner:
sys.stdout.write(f"\r{self.message} {self.spinner_symbols[self.idx % len(self.spinner_symbols)]}")
sys.stdout.flush()
time.sleep(0.1)
self.idx += 1
def start(self):
self.stop_spinner = False
self.thread = threading.Thread(target=self.spinner_task)
self.thread.start()
def stop(self):
self.stop_spinner = True
self.thread.join()
sys.stdout.write('\r' + ' '*(len(self.message)+2) + '\r') # Erase spinner
sys.stdout.flush()
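# Accumulated chapter breakdown for the whole video, keyed by chapter title.
# Updated after every analyzed interval and dumped to JSON at the end of the run.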
chapter_summary = {}
@log_execution_time
def AnalyzeVideo(vp, fi, fpi):
    # Constants
    video_path = vp  # Path of the video to analyze
    output_frame_dir = 'frames'
    output_audio_dir = 'audio'
    transcriptions_dir = 'transcriptions'
    frame_interval = fi  # Length of each analysis window, in seconds (e.g. 180)
    frames_per_interval = fpi  # Number of frames sampled from each window
# Ensure output directories exist
for directory in [output_frame_dir, output_audio_dir, transcriptions_dir]:
if not os.path.exists(directory):
os.makedirs(directory)
# Encode image to base64
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
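    # Minimal helper for the Azure OpenAI chat completions REST endpoint; the URL is
    # built from the resource name and deployment, and the key goes in the api-key header.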
def send_post_request(resource_name, deployment_name, api_key,data):
url = f"https://{resource_name}.openai.azure.com/openai/deployments/{deployment_name}/chat/completions?api-version=2023-12-01-preview"
headers = {
"Content-Type": "application/json",
"api-key": api_key
}
response = requests.post(url, headers=headers, data=json.dumps(data))
return response
# GPT-4 vision analysis function
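    # Sends one interval's worth of data to GPT-4 with vision: the running chapter
    # summary, the interval's audio transcript, and the captured frames (as base64
    # images, each preceded by its capture-time filename so the model can timestamp scenes).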
def gpt4_vision_analysis(image_path, api_key, summary, trans):
cont=[
{
"type": "text",
"text": f"Current Summary up to last {frame_interval} seconds: "+summary
},
{
"type": "text",
"text": f"Audio Transcription for last {frame_interval} seconds: "+trans
},
{
"type": "text",
"text": f"Next are the {frames_per_interval} frames from the last {frame_interval} seconds of the video:"
}
]
for img in image_path:
base64_image = encode_image(img)
            cont.append({
                "type": "text",
                "text": f"The next image is {img} (the 's' in the filename is seconds). Use this to provide timestamps and understand timing."
            })
cont.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
})
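        # Two example chapters showing the exact JSON shape the model must return: an array
        # of chapter objects, each with a title, start/end frame times, and a contiguous list of scenes.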
json_form=str([json.dumps({"title":"Chapter 1: A new beginning","start_frame":"0.0s","end_frame":"253.55s","scenes":[{"title":"Scene 1: it started","description":"The thing happened","start_frame":"0.0s","end_frame":"30.0s"},{"title":"Scene 2: around again","description":"Another thing happened","start_frame":"30.0s","end_frame":"75.0s"}]}),json.dumps({"title":"Chapter 2: Next steps","start_frame":"253.55s","end_frame":"604.90s","scenes":[{"title":"Scene 1: new hope","description":"The thing happened","start_frame":"275.0s","end_frame":"310.0s"},{"title":"Scene 2: bad days","description":"Another thing happened","start_frame":"310.0s","end_frame":"360.0s"}]})])
        # The same system prompt drives both the Azure and OpenAI code paths.
        system_prompt = f"""You are VideoAnalyzerGPT. Your job is to take as input a transcription of {frame_interval} seconds of audio from a video,
        as well as {frames_per_interval} frames split evenly throughout those {frame_interval} seconds.
        You are then provided a Current Chapter Breakdown of the video so far (3 most recent chapters only),
        which is generated from your analysis of each frame ({frames_per_interval} in total),
        as well as the in-between audio, until iteratively we have a full breakdown of all the chapters of the video.
        As the main intelligence of this system, you are responsible for building the Current Chapter Breakdown using both the audio you are provided
        via transcription and the image of each frame.
        Always and only return as your output the updated Current Chapter Breakdown in format ```{json_form}```.
        (The format is a template; make sure to start at chapter 1 in your generation if there is not one already.)
        The start and end frames represent the times that a scene and chapter start and end; use the data provided above each image, and in the audio, to service this feature.
        You can think through your responses step by step. Determine the chapters contextually using the audio and analyzed video frames.
        You don't need to provide a new chapter for every frame; the chapters should represent overarching themes and moments.
        Always provide new or updated chapters in your response, and consider them all for editing purposes on each pass.
        The Chapter Response should be a JSON object array, with each chapter being a JSON object, with each key being a scene title in the chapter,
        with the value being an array of information about the scene, with the first key in each object being the title of the chapter.
        The thresholds required for a new chapter are: Major Thematic Change, Major Story Change, Major Setting Change.
        Think through your Chapter assignment process step by step before providing the response JSON.
        Do not make up timestamps; use the ones provided with each frame.
        Provide back the response as JSON, and always and only return back JSON following the format specified.
        Scenes in a given chapter must be contiguous.
        """
        payload = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": cont}
            ],
            "max_tokens": 4000,
            "seed": 42
        }
        if vision_api_type == "Azure":
            # send_post_request builds the Azure URL and api-key header from the resource name, deployment, and key.
            response = send_post_request(vision_endpoint, vision_deployment, azure_vision_key, payload)
        else:
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}"
            }
            payload["model"] = "gpt-4-vision-preview"
            response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        return response.json()
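    # Merges the chapters returned by the model into the global chapter_summary and
    # returns only the three most recent chapters, which become the "Current Summary"
    # context for the next interval (keeping the prompt size bounded).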
    def update_chapter_summary(new_json_string):
        global chapter_summary
        # The model wraps its output in a ```json ... ``` fence; drop the leading 'json' tag if present.
        if new_json_string.startswith('json'):
            new_json_string = new_json_string[4:]
        # new_json_string is the JSON array of chapters returned from the API call.
        new_chapters_list = json.loads(new_json_string)
# Iterate over the list of new chapters
for chapter in new_chapters_list:
chapter_title = chapter['title']
# Update the chapter_summary with the new chapter
chapter_summary[chapter_title] = chapter
# Get keys of the last three chapters
last_three_keys = list(chapter_summary.keys())[-3:]
# Get the last three chapters as an array
last_three_chapters = [chapter_summary[key] for key in last_three_keys]
return last_three_chapters
    # Load video
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Frames per second
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Load video audio
    video_clip = VideoFileClip(video_path)
    video_duration = video_clip.duration  # Duration of the video in seconds
    # Processing state
    current_frame = 0       # Current frame index
    current_second = 0      # Current playback position in seconds
    current_summary = ""    # Rolling summary of the 3 most recent chapters, fed back to the model
    packet = []             # Frame image paths collected for the current interval
    current_interval_start_second = 0
    capture_interval_in_frames = int(fps * frame_interval / frames_per_interval)  # Frames between captured images
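    # For example, with frame_interval=60 and frames_per_interval=10 on a 30 fps video,
    # a frame is saved every int(30 * 60 / 10) = 180 frames, i.e. every 6 seconds.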
spinner = Spinner("Capturing Video and Audio...")
spinner.start()
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
current_second = current_frame / fps
if current_frame % capture_interval_in_frames == 0 and current_frame != 0:
            # Save the frame at this capture point; the filename encodes the timestamp in seconds.
            frame_name = f'frame_at_{current_second}s.jpg'
frame_path = os.path.join(output_frame_dir, frame_name)
cv2.imwrite(frame_path, frame)
packet.append(frame_path)
if len(packet) == frames_per_interval or (current_interval_start_second + frame_interval) < current_second:
audio_name = f'audio_at_{current_interval_start_second}s.mp3'
audio_path = os.path.join(output_audio_dir, audio_name)
audio_clip = video_clip.subclip(current_interval_start_second, min(current_interval_start_second + frame_interval, video_clip.duration)) # Avoid going past the video duration
audio_clip.audio.write_audiofile(audio_path, codec='mp3', verbose=False, logger=None)
                # Transcribe this interval's audio with Whisper (Azure or OpenAI), then send the frames and transcript to GPT-4 vision below.
spinner.stop()
spinner = Spinner("Transcribing Audio...")
spinner.start()
                def transcribe_audio(audio_path, endpoint, api_key, deployment_name):
                    # Build the Azure OpenAI Whisper URL from the configured resource name and deployment.
                    url = f"https://{endpoint}.openai.azure.com/openai/deployments/{deployment_name}/audio/transcriptions?api-version=2024-02-01"
                    headers = {
                        # Do not set Content-Type manually; requests adds the multipart boundary when files= is used.
                        "api-key": api_key
                    }
                    files = {
                        "file": (os.path.basename(audio_path), open(audio_path, "rb"), "audio/mp3"),
                        "locale": "en-US",
                        # Request per-segment timestamps, which the transcript loop below expects.
                        "response_format": (None, "verbose_json"),
                    }
                    response = requests.post(url, headers=headers, files=files)
                    return response
                if audio_api_type == "Azure":
                    # The Azure path returns a requests.Response; parse the JSON body to get the segments.
                    response = transcribe_audio(audio_path, azure_whisper_endpoint, AZ_WHISPER, azure_whisper_deployment)
                    segments = response.json().get("segments", [])
                else:
                    from openai import OpenAI
                    client = OpenAI()
                    with open(audio_path, "rb") as audio_file:
                        response = client.audio.transcriptions.create(
                            model="whisper-1",
                            file=audio_file,
                            response_format="verbose_json"
                        )
                    segments = response.segments or []
                # Build a timestamped transcript line per segment, e.g. "12.5s - 18.0s: ...".
                current_transcription = ""
                for item in segments:
                    # Segments arrive as dicts from the REST call and as objects from the OpenAI SDK.
                    start = item["start"] if isinstance(item, dict) else item.start
                    end = item["end"] if isinstance(item, dict) else item.end
                    text = item["text"] if isinstance(item, dict) else item.text
                    current_transcription += f"{round(start, 2)}s - {round(end, 2)}s: {text}\n"
spinner.stop()
spinner = Spinner("Processing Frames and Audio with AI...")
spinner.start()
                # Analyze the interval's frames and transcript with GPT-4 vision.
vision_response = gpt4_vision_analysis(packet, openai_api_key, current_summary, current_transcription)
                vision_analysis = ""
                try:
                    vision_analysis = vision_response["choices"][0]["message"]["content"]
                except (KeyError, IndexError, TypeError):
                    # The API returned an error payload instead of a completion; surface it for debugging.
                    print(vision_response)
try:
chapter_text = str(vision_analysis).split("```")[1]
last_three_chapters = update_chapter_summary(chapter_text)
# Convert the last three chapters back to JSON string to update current_summary
current_summary = json.dumps(last_three_chapters, ensure_ascii=False)
except Exception as e:
print("bad json",str(e))
current_summary=str(vision_analysis)
spinner.stop()
                print(f'{current_summary}\n')
spinner = Spinner("Capturing Video and Audio...")
spinner.start()
packet.clear() # Clear packet after analysis
current_interval_start_second += frame_interval # Move to the next set of frames
if current_second >= video_clip.duration:
break
current_frame += 1
current_second = current_frame / fps
#current_second = int(current_frame / fps)
# Release resources
cap.release()
cv2.destroyAllWindows()
print('Extraction, analysis, and transcription completed.')
print("\n\n\n"+json.dumps(chapter_summary,indent=2))
AnalyzeVideo("207566398_test_video.mp4",60,10)
with open('chapterBreakdown.json', 'w') as f:
# Write the data to the file in JSON format
json.dump(chapter_summary, f, indent=4)
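# A minimal sketch of consuming the output file (illustrative only; key names follow the
# json_form template above):
#
#   with open('chapterBreakdown.json') as f:
#       chapters = json.load(f)
#   for title, chapter in chapters.items():
#       print(title, chapter["start_frame"], "->", chapter["end_frame"])
#       for scene in chapter["scenes"]:
#           print("   ", scene["title"], scene["description"])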