# ActionSummary-incar.py
import cv2
import os
import base64
import requests
from moviepy.editor import VideoFileClip
import logging
import json
import sys
import openai
import threading
from retrying import retry
logging.getLogger('moviepy').setLevel(logging.ERROR)
import time
from functools import wraps
from dotenv import load_dotenv
final_arr=[]
load_dotenv()
#azure speech key
speech_key=os.environ["AZURE_SPEECH_KEY"]
#azure whisper key *
AZ_WHISPER=os.environ["AZURE_WHISPER_KEY"]
#Azure whisper deployment name *
azure_whisper_deployment=os.environ["AZURE_WHISPER_DEPLOYMENT"]
#Azure whisper endpoint (full base URL, e.g. https://<resource>.openai.azure.com) *
azure_whisper_endpoint=os.environ["AZURE_WHISPER_ENDPOINT"]
#azure openai vision api key *
#azure_vision_key=os.environ["AZURE_VISION_KEY"]
#Audio API type (OpenAI, Azure)*
audio_api_type=os.environ["AUDIO_API_TYPE"]
#GPT4 vision APi type (OpenAI, Azure)*
vision_api_type=os.environ["VISION_API_TYPE"]
#OpenAI API Key*
openai_api_key=os.environ["OPENAI_API_KEY"]
#GPT4 Azure vision API Deployment Name*
vision_deployment=os.environ["VISION_DEPLOYMENT_NAME"]
#GPT4 Azure vision API endpoint*
vision_endpoint=os.environ["VISION_ENDPOINT"]
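# The variables above are read from a local .env file via load_dotenv(). A minimal
# sketch of such a file is shown below; every value is a placeholder, and the
# deployment names ("whisper", "gpt-4o") are illustrative assumptions, not values
# taken from this repository:
#
#   AZURE_SPEECH_KEY=<your-speech-key>
#   AZURE_WHISPER_KEY=<your-whisper-key>
#   AZURE_WHISPER_DEPLOYMENT=whisper
#   AZURE_WHISPER_ENDPOINT=https://<your-whisper-resource>.openai.azure.com   (full base URL, as used in transcribe_audio)
#   AUDIO_API_TYPE=Azure            (or OpenAI)
#   VISION_API_TYPE=Azure           (or OpenAI)
#   OPENAI_API_KEY=<your-key>
#   VISION_DEPLOYMENT_NAME=gpt-4o
#   VISION_ENDPOINT=<your-vision-resource-name>   (resource name only, as used in send_post_request)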
def log_execution_time(func):
@wraps(func) # Preserves the name and docstring of the decorated function
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds to complete.")
return result
return wrapper
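# Illustrative use of the decorator above (the function name here is hypothetical):
#
#   @log_execution_time
#   def extract_frames():
#       ...
#
# Calling extract_frames() would print something like
# "Function extract_frames took 1.2345 seconds to complete."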
class Spinner:
def __init__(self, message="Processing..."):
self.spinner_symbols = "|/-\\"
self.idx = 0
self.message = message
self.stop_spinner = False
def spinner_task(self):
while not self.stop_spinner:
sys.stdout.write(f"\r{self.message} {self.spinner_symbols[self.idx % len(self.spinner_symbols)]}")
sys.stdout.flush()
time.sleep(0.1)
self.idx += 1
def start(self):
self.stop_spinner = False
self.thread = threading.Thread(target=self.spinner_task)
self.thread.start()
def stop(self):
self.stop_spinner = True
self.thread.join()
sys.stdout.write('\r' + ' '*(len(self.message)+2) + '\r') # Erase spinner
sys.stdout.flush()
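# Minimal usage sketch for the Spinner class, mirroring how it is used further below:
#
#   spinner = Spinner("Transcribing Audio...")
#   spinner.start()
#   ...            # long-running work
#   spinner.stop() # joins the spinner thread and erases the status line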
chapter_summary = {}
miss_arr=[]
@log_execution_time
def AnalyzeVideo(vp,fi,fpi,face_rec=False):
# Constants
video_path = vp # Replace with your video path
output_frame_dir = 'frames'
output_audio_dir = 'audio'
global_transcript=""
transcriptions_dir = 'transcriptions'
frame_interval = fi # Chunk video evenly into segments by a certain interval, unit: seconds
frames_per_interval = fpi # Number of frames to capture per interval, unit: frames
totalData=""
# Ensure output directories exist
for directory in [output_frame_dir, output_audio_dir, transcriptions_dir]:
if not os.path.exists(directory):
os.makedirs(directory)
# Encode image to base64
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
#GPT 4 Vision Azure helper function
def send_post_request(resource_name, deployment_name, api_key,data):
url = f"https://{resource_name}.openai.azure.com/openai/deployments/{deployment_name}/chat/completions?api-version=2024-08-01-preview" #2024-06-01
headers = {
"Content-Type": "application/json",
"api-key": api_key
}
print(f"resource_name: {resource_name}")
print(f"Sending POST request to {url}")
print(f"Headers: {headers}")
# print(f"Data: {json.dumps(data)}")
print(f"api_key: {api_key}")
response = requests.post(url, headers=headers, data=json.dumps(data))
return response
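# For reference, with a hypothetical resource name "my-aoai" and deployment "gpt-4o",
# the helper above would POST to:
#   https://my-aoai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview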
# GPT-4 vision analysis function
@retry(stop_max_attempt_number=3)
def gpt4_vision_analysis(image_path, api_key, summary, trans):
#array of metadata
cont=[
{
"type": "text",
"text": f"Audio Transcription for last {frame_interval} seconds: "+trans
},
{
"type": "text",
"text": f"Next are the {frames_per_interval} frames from the last {frame_interval} seconds of the video:"
}
]
#adds images and metadata to package
for img in image_path:
base64_image = encode_image(img)
cont.append( {
"type": "text",
"text": f"Below this is {img} (s is for seconds). use this to provide timestamps and understand time"
})
cont.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail":"high"
}
})
#extraction template
json_form=str([json.dumps({"Start_Timestamp":"4.97s","sentiment":"Positive, Negative, or Neutral","End_Timestamp":"16s","scene_theme":"Dramatic","characters":"Characters is an array containing all the characters detected in the current scene. For each character, always provide the seven fields: 'is_child','number_of_children','current_child','gender','location','wearing_seat_belt','Sleeping'. Example: [Man in hat, woman in jacket,{'is_child':'Yes','number_of_children':2,'current_child':'1','gender':'Male','location':'Rear left'(distinguish each kid by location),'wearing_seat_belt':'否','Sleeping':'是,因为眼睛闭着','description':'A boy about 6 years old, wearing a blue T-shirt and jeans, sitting in the rear left seat without a seatbelt.'}]","summary":"Summary of what is occurring around this timestamp with actions included, uses both transcript and frames to create a full picture, be detailed and attentive, be serious and straightforward in your description. Focus strongly on seatbelt location on the body and actions related to seatbelts.","actions":"Actions extracted via frame analysis","key_objects":"Any objects in the timerange, include colors along with descriptions. All people should be in this, with as much detail as possible extracted from the frame (clothing, colors, age). Be incredibly detailed","key_actions":"action labels extracted from actions, but do not miss any key actions listed in tasks","prediction":"Prediction of what will happen next, especially seatbelt related actions or dangerous actions, based on the current scene."}),
json.dumps({"Start_Timestamp":"16s","sentiment":"Positive, Negative, or Neutral","End_Timestamp":"120s","scene_theme":"Emotional, Heartfelt","characters":"Man in hat, woman in jacket","summary":"Summary of what is occurring around this timestamp with actions included, uses both transcript and frames to create a full picture, detailed and attentive, be serious and straightforward in your description.","actions":"Actions extracted via frame analysis","key_objects":"Any objects in the timerange, include colors along with descriptions. All people should be in this, with as much detail as possible extracted from the frame (clothing, colors, age). Be incredibly detailed","key_actions":"only key action labels extracted from actions","prediction":"Prediction of what will happen next, based on the current scene."})])
if(vision_api_type=="Azure"):
print("sending request to gpt-4o")
payload2 = {
"messages": [
{
"role": "system",
"content": f"""You are VideoAnalyzerGPT. Your job is to take in as an input a transcription of {frame_interval} seconds of audio from a video,
as well as {frames_per_interval} frames split evenly throughout {frame_interval} seconds.
You are then to generate and provide a Current Action Summary (An Action summary of the video,
with actions being added to the summary via the analysis of the frames) of the portion of the video you are considering ({frames_per_interval}
frames over {frame_interval} seconds), which is generated from your analysis of each frame ({frames_per_interval} in total),
as well as the in-between audio, until we have a full action summary of the portion of the video you are considering,
that includes all actions taken by characters in the video as extracted from the frames. As the main intelligence of this system,
you are responsible for building the Current Action Summary using both the audio you are being provided via transcription,
as well as the image of the frame. Always and only return as your output the updated Current Action Summary in format ```{json_form}```.
Make sure **EVERY** field in above json_form is filled out!!!
Do not make up timestamps, use the ones provided with each frame.
Construct each action summary block from multiple frames; each block should represent a scene or distinct trick or move in the video, with a minimum of 2 blocks per output.
Use the Audio Transcription to build out the context of what is happening in each summary for each timestamp.
Consider all frames image by image and audio given to you to build the Action Summary. Be as descriptive and detailed as possible,
Make sure to try and **Analyze the frames image by image** as a cohesive 10 seconds of video.
---
***Tasks:***
You are an in-car AI video assistant that helps passengers keep a comfortable environment.
Task 1: Always execute character recognition and report it in the 'characters' field - PRIORITY ONE:
- If there are children in the image, determine the following:
•Is it a child: Yes, No, Infant, Child
•Number of children: 0, 1, 2, 3...仔细观察可能被遮挡的位置,如果有人仅露出头发,大部分头部被遮挡,请把儿童数量也估算在内
**RE-READING 儿童数量 IS REQUIRED FOR ALL TASKS**
•Gender: Male, Female, Unknown
•Location: Front passenger seat, Rear left, Rear center, Rear right
•Wearing seat belt: Yes, No, Unknown (if unsure, report it as Unknown)
•Sleeping: Yes(with criteria), No(with criteria)
Task 2: Children DANGEROUS Behavior Monitoring in Cabin - PRIORITY ONE:
**key_actions**:
- Report *SPECIFIC* behavior **TERMS** in **key_actions**, with a strong focus on critical behaviors such as sticking their head out of the window, etc.
- Report specific behaviors in **key_actions**,with a strong focus on seat-belt related actions. Pay special attention to behaviors such as unbuckling the seat belt, attempting to escape from the seatbelt, or tampering with it.
- Extract the exact related actions into key_actions, eg, "挣脱安全带", not only report the final state of seat belt. If the state of seat belt has changed, you should report the specific action of changing the state of seat belt!!!
eg, key_actions: "孩子用如何动作导致挣脱了安全带" instead of "未系安全带".
1. If you find any child's head rested against the window, you should report children's head rested against which window of the car("头部靠着车窗").
2. If you find any child's head rested against the door, you should report children's head rested against which door of the car.
3. **HIGH PRIORITY** If you find any child's **HEAD** *STICKING OUT OF THE WINDOW*, you should report children's head sticking out of which window("头伸出窗外").
4. If you find any child holding the door handle, you should report children holding which door handle.
5. If you find any child's hand resting on the door handle, you should report children's hand resting on which door handle.
6. If you find any child sticking their hand out of the window, you should report children sticking their hand out of which window.
7. If you find any child's body sticking out of the window, you should report children sticking their body out of which window.
8. **HIGH PRIORITY** If you find any child unbuckling their seat belt, you should report children unbuckling the seat belt with the exact action.
Task 3: Children Behavior Monitoring in Cabin - PRIORITY TWO:
- Report specific behaviors in **key_actions**,with a strong focus on seat-belt related actions. Pay special attention to behaviors such as unbuckling the seat belt, attempting to escape from the seatbelt, or tampering with it.
- Extract the exact related actions into key_actions, eg, "挣脱安全带", not only report the final state of seat belt. If the state of seat belt has changed, you should report the specific action of changing the state of seat belt!!!
1.If you find any child closing eyes, dozing off or sleeping on the seat, you should report children sleeping in key_actions.
a) Closed Eyes: Check if the passenger's eyes are closed, as this is a common indicator of sleep.
b) Head Position: Observe the passenger's head posture. If the head is slightly tilted back or in a relaxed position, it may suggest that the person is sleeping.
c) Body Posture: Examine the body posture. If the arms are crossed in front of the chest and the body appears relaxed and motionless, it might indicate the person is asleep.
2.If you find any child singing, you should report children singing in key_actions.
3.If you find any child eating something, you should report children eating in key_actions.
a) Hand-to-Mouth Movement: Watch for the child bringing food or utensils to their mouth. If the hand is positioned near the mouth, and the child is chewing or swallowing, it indicates eating, NOT TO BE CONFUSED WITH TOYS.
b) If the item held by the child is small and easy to hold, and its size is appropriate for single or double-handed gripping, such items are usually snacks or small food items, NOT TO BE CONFUSED WITH TOYS.
c) If the item's packaging or shape resembles common snack packaging, such as small bags, stick shapes, bars, or blocks, it can be inferred that the child might be eating something, NOT TO BE CONFUSED WITH TOYS.
By observing these indicators, you can accurately determine if a child is eating, not playing with toys.
5.If you find any child drinking, you should report children drinking or drinking through a straw in key_actions.
6. If you find any child gesticulating wildly, you should report children gesticulating wildly in key_actions.
7. If you find any child beating someone or something, you should report children beating someone or something in key_actions.
8. If you find any child throwing something, you should report children throwing in key_actions.
9. If you find any child fighting someone or something, you should report children fighting in key_actions.
10. If you find any child attempting to **struggle** or **break free from their seat belt**, you should report children "unfastening the seat belt" in key_actions.
- **Struggling to unfasten the seatbelt** includes the following specific actions:
a) The child **pulling or tugging** at the seatbelt with visible effort.
b) **Reaching and grasping** the seatbelt buckle or strap multiple times.
c) **Pushing or kicking** against the seatbelt or seat in an attempt to free themselves.
d) **Twisting or turning** their body in an unnatural way while pulling at the seatbelt.
e) **Crying or showing distress** while interacting with the seatbelt, which often accompanies struggling.
- **Do not** only report the state like "not wearing a seatbelt" or "unbuckled seatbelt"; instead, focus on the specific ongoing action of struggling to break free from the seatbelt.
- Ensure to report any such actions in **key_actions** with clear descriptions.
11. If you find any child standing up or half-kneeling(半跪), you should report children standing up or half-kneeling(半跪) in key_actions.
"半跪" refers to a posture of 半蹲半站立,坐在脚上, 不是传统上的半跪指的是一条腿跪着,另一条腿站着,也要注意区别于仅仅"坐着".
12. If you find any child jumping/bouncing, you should report children jumping in key_actions.
13. If you find any child crying, you should report children crying in key_actions.
14. If you find any child laughing, you should report children laughing in key_actions.
15. If you find any child taking off their clothes, you should report children taking off their clothes in key_actions.
16. If you find any child pulling off the blanket, quilt/duvet, you should report children pulling off the blanket in key_actions.
The child grabbed the blanket forcefully, swung it around, moved it off, and then let it go, causing the blanket and quilt to be pulled off from their body. It's not just about the child holding the blanket.
17. If you find any child speaking or talking, you should report children speaking in key_actions.
18. If you find any child moving freely, you should report children's exact activities in key_actions.
Among the behaviors mentioned above, **SLEEPING**, **STANDING-UP**, **TAKING OFF CLOTHES/BLANKET**, or **SEATBELT-RELATED ACTIONS** (such as **breaking free from the seatbelt** or **unfastening the seatbelt**) are **PRIORITY** behaviors. When you detect these behaviors, you should **report them FIRST** because the driver can see or feel other behaviors associated with these key actions.
Task 4: **Proactive** Prediction of High-Risk Behaviors in Children - Priority One:
Predict the high-risk behaviors that the child may exhibit in the upcoming period, with a special focus on actions related to the **SEATBELT**. Pay close attention to the following early warning signs:
•Seatbelt Position Abnormalities: Such as the seatbelt sliding downwards, becoming loose, or not being worn correctly.
•Touching or Fidgeting with the Seatbelt: If the child is trying to adjust, pull, or play with the seatbelt, it may indicate they are about to unbuckle or interfere with it.
•Changes in Body Posture: Such as leaning forward or twisting, which may suggest an attempt to escape the seat or stick their head out of the window.
Other early signs to observe include:
•Distracted or Restless Behavior: May indicate impending high-risk actions.
•Direction of Gaze: Frequently looking at the window, door, or seatbelt buckle.
Please fill in the "prediction" field of the JSON output with the predicted behaviors based on these early signs.
"Behavior_Rules": {{
"Primary_Tasks": [
{{
"Category": "儿童舱内行为监测",
"Description": "识别儿童是否睡觉、站立/半跪、脱衣/扯掉毯子。",
"Priority": "高",
"Behaviors": [
"睡眠",
"站立/半跪",
"脱衣/扯掉毯子"
]
}},
{{
"Category": "儿童危险行为检测及预警",
"Description": "检测儿童是否进行危险行为,如头伸出车窗、手伸出车窗等。",
"Priority": "高",
"Behaviors": [
"儿童头伸出窗外",
"儿童手伸出车窗",
"儿童解开安全带"
]
}},
{{
"Category": "儿童舱内行为监测",
"Description": "监测儿童的日常行为,如吃喝、哭泣、笑等。",
"Priority": "中",
"Behaviors": [
"吃",
"喝",
"说话",
"哭",
"笑",
"跳跃",
"打架",
"打闹",
"扔东西"
]
}}
]
}},
**重要注意事项**:
1.上述多项任务是同时检测的,当您发现儿童在执行多项行为时,您应该同时报告这些行为,以便司机能够及时采取行动。如果喝东西的同时在说话,您应该同时报告这两项行为。
2.如果某个动作或手持物品不明确,您应该按照最大可能的行为或物品进行回答。如手持饮料又靠近嘴边,就很可能是在吃或喝东西,应该回答吃或喝东西,而不要只是汇报"物品",甚至"玩具",也不要说"手拿物品"!!!
对动作和物体应避免使用抽象词汇,要按越具体越好的原则给出你的猜测,否则你会视为不合格的执行了一次失败的任务,我会看不懂你的输出!!!
!!!!!不要使用"物品"这样的不明确词汇!!!!!不要使用手里拿着一个"物品"这样的模糊词汇!!!!!应该说:"正在吃东西"或"正在喝饮料"!!!!!
你现在是一名中文助手。无论我问什么问题,你都必须只用中文回答。请不要使用任何其他语言。You must always and only answer totally in **Chinese** language!!! I can only read Chinese language. Ensure all parts of the JSON output, including **summaries**, **actions**, and **next_action**, **MUST BE IN CHINESE** If you answer ANY word in English, you are fired immediately! Translate English to Chinese if there is English.
Your goal is to create the best action summary you can. Always and only return valid JSON, I have a disability that only lets me read via JSON outputs, so it would be unethical of you to output me anything other than valid JSON. Totally respond in Chinese language. Translate English to Chinese if there is any English words.仅用中文回答,如果有英文单词,请翻译成中文。
Example 1 - Correct:
"Start_Timestamp": "0.5s",
"sentiment": "Neutral",
"End_Timestamp": "5.0s",
"scene_theme": "日常",
"characters": [
{{
"is_child": "是",
"number_of_children": "2",
"current_child": "1",
"gender": "未知",
"location": "后排左侧-可以看到一个人(可能是儿童)的头部,从前排座椅后面露出(不要因为看不到全部脸部就不算做一个人数!!!!!)",
"wearing_seat_belt": "是",
"Sleeping": "否",
"description": "一个戴着黄色帽子的孩子,坐在后排左侧的儿童座椅上,系着安全带/正在用手解开安全带, 手里拿着白色饮料,正在喝饮料",
}}
],
"summary": "在这段视频中,一个孩子坐在后排左侧的儿童座椅上,戴着黄色的帽子并系着安全带,逐渐睡着, 前排座位上有一成年人",
"actions": "孩子坐在儿童座椅上睡着了, 孩子戴着黄色的帽子, 孩子系着安全带/正在解开安全带,坐着逐渐睡着了, 手里拿着零食正在吃零食, 孩子正在用吸管喝饮料",
"key_objects": "",
"key_actions": "孩子系着安全带/孩子用脚蹬踹,最终挣脱安全带,孩子坐着睡着了,扯掉毯子,吃零食, 喝饮料",
"prediction": "孩子安全带已滑落在脚底位置,孩子将要挣脱掉安全带,因此可能会摔倒,请注意安全"
Example 2 - Correct:
"summary":"安全带位置滑落在孩子脚底,仅有一部分在孩子腿上",
"key_actions":"孩子用脚蹬安全带,安全带位置在孩子脚底,孩子将要挣脱掉安全带,因此可能会摔倒,请注意安全",
Example 2 - Wrong: 只说"手里拿着一个物品"太过模糊,应该说"正在吃零食"或"正在喝饮料"!!!!!
"characters": [
{{
"is_child": "是",
...
"description": "一个坐在后排左侧的孩子,穿着绿色的衣服,手里拿着一个物品(错误)。"
}},
{{
"is_child": "是",
...
"description": "一个坐在后排右侧的孩子,手里拿着一个物品(错误)。"
}}
],
"summary": "在这段视频中,两个孩子都在手里拿着物品(错误)。",
"actions": "两个孩子坐在后排座位上,手里拿着物品(错误)。",
"key_actions": "孩子们坐在后排座位上,手里拿着物品(错误)。"
"prediction": "-"
"""
},
{
"role": "user",
"content": cont
}
],
"max_tokens": 4000,
"seed": 42,
"temperature": 0
}
response=send_post_request(vision_endpoint,vision_deployment,openai_api_key,payload2)
else:
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
payload = {
"model": "gpt-4-vision-preview",
"messages": [
{
"role": "system",
"content": f"""You are VideoAnalyzerGPT. Your job is to take in as an input a transcription of {frame_interval} seconds of audio from a video,
as well as {frames_per_interval} frames split evenly throughout {frame_interval} seconds.
You are then to generate and provide a Current Action Summary (An Action summary of the video,
with actions being added to the summary via the analysis of the frames) of the portion of the video you are considering ({frames_per_interval}
frames over {frame_interval} seconds), which is generated from your analysis of each frame ({frames_per_interval} in total),
as well as the in-between audio, until we have a full action summary of the portion of the video you are considering,
that includes all actions taken by characters in the video as extracted from the frames. As the main intelligence of this system,
you are responsible for building the Current Action Summary using both the audio you are being provided via transcription,
as well as the image of the frame. Always and only return as your output the updated Current Action Summary in format ```{json_form}```.
Do not make up timestamps, use the ones provided with each frame.
Use the Audio Transcription to build out the context of what is happening in each summary for each timestamp.
Consider all frames and audio given to you to build the Action Summary. Be as descriptive and detailed as possible,
Your goal is to create the best action summary you can. Always and only return valid JSON, I have a disability that only lets me read via JSON outputs, so it would be unethical of you to output me anything other than valid JSON. Answer in Chinese language."""
},
{
"role": "user",
"content": cont
}
],
"max_tokens": 4000,
"seed": 42
}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
print(response.json())
if(response.status_code!=200):
return -1
else:
return response.json()
def update_chapter_summary(new_json_string):
global chapter_summary
if new_json_string.startswith('json'):
# Remove the first occurrence of 'json' from the response text
new_json_string = new_json_string[4:]
else:
new_json_string = new_json_string
# Assuming new_json_string is the JSON format string returned from your API call
new_chapters_list = json.loads(new_json_string)
# Iterate over the list of new chapters
for chapter in new_chapters_list:
chapter_title = chapter['title']
# Update the chapter_summary with the new chapter
chapter_summary[chapter_title] = chapter
# Get keys of the last three chapters
last_three_keys = list(chapter_summary.keys())[-3:]
# Get the last three chapters as an array
last_three_chapters = [chapter_summary[key] for key in last_three_keys]
return last_three_chapters
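# update_chapter_summary expects a JSON array of chapter objects, each carrying at
# least a 'title' key, for example (shape is illustrative only):
#   [{"title": "Chapter 1", "summary": "..."}, {"title": "Chapter 2", "summary": "..."}]
# It merges them into the global chapter_summary dict and returns the three most
# recent chapters. Note that this helper is not called anywhere else in this script.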
# Load video
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS) # Frames per second
# Load video audio
video_clip = VideoFileClip(video_path)
video_duration = video_clip.duration # Duration of the video in seconds
# Process video
current_frame = 0 # Current frame initialized to 0
current_second = 0 # Current second initialized to 0
current_summary=""
packet=[]
current_interval_start_second = 0
capture_interval_in_frames = int(fps * frame_interval / frames_per_interval) # Interval in frames to capture the image
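# Worked example of the capture cadence: with fps = 30, frame_interval = 5 and
# frames_per_interval = 5, capture_interval_in_frames = int(30 * 5 / 5) = 30,
# i.e. one frame is written out every 30 frames (once per second of video).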
spinner = Spinner("Capturing Video and Audio...")
spinner.start()
packet_count=1
# Initialize known face encodings and their names if provided
known_face_encodings = []
known_face_names = []
def load_known_faces(known_faces):
for face in known_faces:
image = face_recognition.load_image_file(face['image_path'])
encoding = face_recognition.face_encodings(image)[0]
known_face_encodings.append(encoding)
known_face_names.append(face['name'])
# Call this function if you have known faces to match against
# load_known_faces(array_of_recognized_faces)
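# Minimal usage sketch of the known_faces argument expected by load_known_faces above
# (the path and name are placeholders):
#   load_known_faces([{"image_path": "faces/alice.jpg", "name": "Alice"}])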
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
current_second = current_frame / fps
if current_frame % capture_interval_in_frames == 0 and current_frame != 0:
print(f"BEEP {current_frame}")
# Extract and save frame
# Save frame at the exact intervals
if(face_rec==True):
import face_recognition
import numpy
##small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
#rgb_frame = frame[:, :, ::-1] # Convert the image from BGR color (which OpenCV uses) to RGB color
rgb_frame = numpy.ascontiguousarray(frame[:, :, ::-1])
face_locations = face_recognition.face_locations(rgb_frame)
#print(face_locations)
face_encodings=False
if(len(face_locations)>0):
face_encodings = face_recognition.face_encodings(rgb_frame, face_locations)
print(face_encodings)
# Initialize an array to hold names for the current frame
face_names = []
if(face_encodings!=False):
for face_encoding in face_encodings:
# See if the face is a match for the known faces
matches = face_recognition.compare_faces(known_face_encodings, face_encoding,0.4)
name = "Unknown"
# If a match was found in known_face_encodings, use the known person's name.
if True in matches:
first_match_index = matches.index(True)
name = known_face_names[first_match_index]
else:
# If no match and we haven't assigned a name, give a new name based on the number of unknown faces
name = f"Person {chr(len(known_face_encodings) + 65)}" # Starts from 'A', 'B', ...
# Add the new face to our known faces
known_face_encodings.append(face_encoding)
known_face_names.append(name)
face_names.append(name)
# Draw rectangles around each face and label them
for (top, right, bottom, left), name in zip(face_locations, face_names):
# Draw a box around the face
cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)
# Draw a label with a name below the face
cv2.rectangle(frame, (left, bottom - 35), (right, bottom), (0, 0, 255), 2)
font = cv2.FONT_HERSHEY_DUPLEX
cv2.putText(frame, name, (left + 6, bottom - 6), font, 0.5, (255, 255, 255), 1)
# Save the frame with bounding boxes
frame_name = f'frame_at_{current_second}s.jpg'
frame_path = os.path.join(output_frame_dir, frame_name)
cv2.imwrite(frame_path, frame)
else:
frame_name = f'frame_at_{current_second}s.jpg'
print("frame_name: ",frame_name, ", current_frame: ", current_frame, ", current_second: ", current_second)
frame_path = os.path.join(output_frame_dir, frame_name)
cv2.imwrite(frame_path, frame)
packet.append(frame_path)
#if packet len is appropriate (meaning FPI is hit) then process the audio for transcription
if len(packet) == frames_per_interval or (current_interval_start_second + frame_interval) < current_second:
current_transcription=""
if video_clip.audio is not None:
audio_name = f'audio_at_{current_interval_start_second}s.mp3'
audio_path = os.path.join(output_audio_dir, audio_name)
audio_clip = video_clip.subclip(current_interval_start_second, min(current_interval_start_second + frame_interval, video_clip.duration)) # Avoid going past the video duration
audio_clip.audio.write_audiofile(audio_path, codec='mp3', verbose=False, logger=None)
headers = {
'Authorization': f'Bearer {openai_api_key}',
}
files = {
'file': open(audio_path, 'rb'),
'model': (None, 'whisper-1'),
}
spinner.stop()
spinner = Spinner("Transcribing Audio...")
spinner.start()
# Actual audio transcription occurs in either OpenAI or Azure
@retry(stop_max_attempt_number=3)
def transcribe_audio(audio_path, endpoint, api_key, deployment_name):
url = f"{endpoint}/openai/deployments/{deployment_name}/audio/transcriptions?api-version=2023-09-01-preview"
headers = {
"api-key": api_key,
}
files_payload = { # renamed so the json module is not shadowed inside this function
"file": (audio_path.split("/")[-1], open(audio_path, "rb"), "audio/mp3"),
}
data = {
'response_format': (None, 'verbose_json')
}
response = requests.post(url, headers=headers, files=files_payload, data=data)
return response
if(audio_api_type == "Azure"):
response = transcribe_audio(audio_path, azure_whisper_endpoint, AZ_WHISPER, azure_whisper_deployment)
else:
from openai import OpenAI
client = OpenAI()
audio_file = open(audio_path, "rb")
response = client.audio.transcriptions.create(
model="whisper-1",
file=audio_file,
response_format="verbose_json",
)
current_transcription = ""
tscribe = ""
# Process transcription response
if(audio_api_type == "Azure"):
try:
for item in response.json()["segments"]:
tscribe += str(round(item["start"], 2)) + "s - " + str(round(item["end"], 2)) + "s: " + item["text"] + "\n"
except:
tscribe += ""
else:
for item in response.segments:
seg = item if isinstance(item, dict) else item.model_dump() # newer openai SDKs return segment objects rather than dicts
tscribe += str(round(seg["start"], 2)) + "s - " + str(round(seg["end"], 2)) + "s: " + seg["text"] + "\n"
global_transcript += "\n"
global_transcript += tscribe
current_transcription = tscribe
else:
print("No audio track found in video clip. Skipping audio extraction and transcription.")
spinner.stop()
spinner = Spinner("Processing the "+str(packet_count)+" Frames and Audio with AI...")
spinner.start()
# Analyze frames with GPT-4 vision
vision_response = gpt4_vision_analysis(packet, openai_api_key, current_summary, current_transcription)
if(vision_response==-1):
packet.clear() # Clear packet after analysis
current_interval_start_second += frame_interval
current_frame += 1
current_second = current_frame / fps
continue
# time.sleep(5)
try:
vision_analysis = vision_response["choices"][0]["message"]["content"]
except:
print(vision_response)
vision_analysis = "" # fall back to an empty analysis so the code below does not raise a NameError
try:
current_summary = vision_analysis
except Exception as e:
print("bad json",str(e))
current_summary=str(vision_analysis)
#print(current_summary)
totalData+="\n"+str(current_summary)
try:
#print(vision_analysis)
#print(vision_analysis.replace("'",""))
vault=vision_analysis.split("```")
if(len(vault)>1):
vault=vault[1]
else:
vault=vault[0]
vault=vault.replace("'","") # keep working on the fenced block extracted above instead of the raw response
vault=vault.replace("json","")
vault=vault.replace("```","")
vault = vault.replace("\\\\n", "\\n").replace("\\n", "\n") # replace escaped \n sequences with real newline characters
if vault.startswith('json'):
# Remove the first occurrence of 'json' from the response text
vault = vault[4:]
#print(vault)
else:
vault = vault
#print(vision_analysis.replace("'",""))
# convert string to json
data=json.loads(vault, strict=False) #If strict is False (True is the default), then control characters will be allowed inside strings.Control characters in this context are those with character codes in the 0-31 range, including '\t' (tab), '\n', '\r' and '\0'.
if isinstance(data, list):
data = data[0]
final_arr.append(data)
if not data:
print("No data")
# for key, value in data:
# final_arr.append(item)
# ##socket.emit('actionsummary', {'data': item}, namespace='/test')
# print(f"Key: {key}, Value: {value}")
with open('actionSummary.json', 'w', encoding='utf-8') as f:
# Write the data to the file in JSON format
json.dump(final_arr, f, indent=4, ensure_ascii=False) #ensure_ascii=False to write in Chinese
print(f"Data written to file: {final_arr}") # 调试信息
except:
miss_arr.append(vision_analysis)
print("missed")
spinner.stop()
spinner = Spinner("Capturing Video and Audio...")
spinner.start()
packet.clear() # Clear packet after analysis
current_interval_start_second += frame_interval # Move to the next set of frames
if current_second > video_duration:
print("Current second is: ", current_second), "Video duration is: ", video_duration, "Exiting loop"
break
current_frame += 1
current_second = current_frame / fps
#current_second = int(current_frame / fps)
# Release resources
cap.release()
cv2.destroyAllWindows()
print('Extraction, analysis, and transcription completed.')
with open('actionSummary.json', 'w', encoding='utf-8') as f:
# Write the data to the file in JSON format
json.dump(final_arr, f, indent=4, ensure_ascii=False)
with open('transcript.txt', 'w', encoding='utf-8') as f:
# Write the accumulated transcript to a plain text file
f.write(global_transcript)
return final_arr
#print("\n\n\n"+totalData)
#AnalyzeVideo("./medal.mp4",60,10)
# AnalyzeVideo("test_video/car-driving.mov",6,10,False)
# AnalyzeVideo("2024.9.1_2/瞌睡.mp4",5,10,False)
# AnalyzeVideo("2024.9.1/儿童低头睡觉.mp4",5,10,False)
AnalyzeVideo("2024.9.1/儿童身体伸出窗外-已伸出.mp4",5,5,False)
# 儿童车内脱衣服 (child taking off clothes in the car)
# 毯子被扯掉 (blanket pulled off)
# 儿童半跪 (child half-kneeling)
# 安全带解开-从坐下到躺下 (seat belt unfastened, from sitting to lying down)
# 儿童低头睡觉 (child sleeping with head down)
# 儿童挣脱安全带 (child breaking free from the seat belt)
# 儿童头伸向窗-未出去 (child's head toward the window, not yet out)
# print(miss_arr)
# if __name__ == "__main__":
# data=sys.argv[1].split(",")
# print(data)
# video_path = data[0]
# frame_interval = data[1]
# frames_per_interval = data[2]
# face_rec=False
# if(len(data)>3):
# face_rec=True
# AnalyzeVideo(video_path, int(frame_interval), int(frames_per_interval),face_rec)
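# If the __main__ block above is re-enabled, the script would be driven from the
# command line with a single comma-separated argument (values below are illustrative):
#   python ActionSummary-incar.py "test_video/car-driving.mov,6,10"
# Passing a fourth comma-separated field (any value) switches face recognition on.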