Merge pull request #31 from ieasybooks/json_output

إضافة صيغة json للمخرجات، إضافة اختيار --version
ieasybooks · Aug 23, 2023 · 08b7b34 · 08b7b34
2 parents 3760bf5 + afed356
commit 08b7b34
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@
   <li>تفريغ المواد المرئية والمسموعة إلى نصوص باستخدام أحدث تقنيات الذكاء الاصطناعي المقدمة من شركة OpenAI</li>
   <li>إمكانية تفريغ المواد باستخدام تقنيات wit.ai المقدمة من شركة Facebook</li>
   <li>تحميل المحتوى المرئي بشكل مباشر من منصة YouTube سواءً كان المستهدف مادة واحدة أو قائمة تشغيل كاملة</li>
-  <li>توفير صيَغ مخرجات مختلفة كـ <code>txt</code> و <code>srt</code> و <code>vtt</code></li>
+  <li>توفير صيَغ مخرجات مختلفة كـ <code>txt</code> و <code>srt</code> و <code>vtt</code> و <code>json</code></li>
 </ul>
 
 <h2 dir="rtl">متطلبات الاستخدام</h2>
@@ -134,6 +134,7 @@
           <li><code dir="ltr">txt</code></li>
           <li><code dir="ltr">srt</code></li>
           <li><code dir="ltr">vtt</code></li>
+          <li><code dir="ltr">json</code></li>
           <li><code dir="ltr">all</code> <strong>(الاختيار الافتراضي)</strong></li>
           <li><code dir="ltr">none</code> (لن يتم إنشاء ملف في حال تمرير هذه الصيغة)</li>
         </ul>
@@ -149,7 +150,7 @@ usage: tafrigh [-h] [--skip_if_output_exist | --no-skip_if_output_exist] [--play
                [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}]
                [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]]
                [--max_cutting_duration [1-17]] [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses]
-               [--output_sample OUTPUT_SAMPLE] [-f {all,txt,srt,vtt,none} [{all,txt,srt,vtt,none} ...]] [-o OUTPUT_DIR]
+               [--output_sample OUTPUT_SAMPLE] [-f {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...]] [-o OUTPUT_DIR]
                urls_or_paths [urls_or_paths ...]
 
 options:
@@ -193,7 +194,7 @@ Output:
                         Whether to save the yt-dlp library JSON responses or not. (default: False)
   --output_sample OUTPUT_SAMPLE
                         Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to disable this behavior.
-  -f {all,txt,srt,vtt,none} [{all,txt,srt,vtt,none} ...], --output_formats {all,txt,srt,vtt,none} [{all,txt,srt,vtt,none} ...]
+  -f {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...], --output_formats {all,txt,srt,vtt,json,none} [{all,txt,srt,vtt,json,none} ...]
                         Format of the output file; if not specified, all available formats will be produced.
   -o OUTPUT_DIR, --output_dir OUTPUT_DIR
                         Directory to save the outputs.

diff --git a/tafrigh/types/transcript_type.py b/tafrigh/types/transcript_type.py
@@ -6,6 +6,7 @@ class TranscriptType(Enum):
     TXT = 'txt'
     SRT = 'srt'
     VTT = 'vtt'
+    JSON = 'json'
     NONE = 'none'
 
     def __str__(self):

diff --git a/tafrigh/utils/cli_utils.py b/tafrigh/utils/cli_utils.py
@@ -1,4 +1,5 @@
 import argparse
+import importlib
 import re
 
 from tafrigh.types.transcript_type import TranscriptType
@@ -17,6 +18,12 @@
 def parse_args(argv: list[str]) -> argparse.Namespace:
     parser = argparse.ArgumentParser()
 
+    parser.add_argument(
+        '--version',
+        action='version',
+        version=importlib.metadata.version('tafrigh'),
+    )
+
     input_group = parser.add_argument_group('Input')
 
     input_group.add_argument(

diff --git a/tafrigh/writer.py b/tafrigh/writer.py
@@ -1,3 +1,4 @@
+import json
 import os
 
 from pathlib import Path
@@ -45,6 +46,8 @@ def write(
             self.write_srt(file_path, segments)
         elif format == TranscriptType.VTT:
             self.write_vtt(file_path, segments)
+        elif format == TranscriptType.JSON:
+            self.write_json(file_path, segments)
 
     def write_txt(
         self,
@@ -67,6 +70,14 @@ def write_vtt(
     ) -> None:
         self._write_to_file(file_path, self.generate_vtt(segments))
 
+    def write_json(
+        self,
+        file_path: str,
+        segments: list[dict[str, Union[str, float]]],
+    ) -> None:
+        """Serialize the transcript segments to a JSON file at `file_path`.
+
+        The file is always written as UTF-8: `ensure_ascii=False` emits
+        non-ASCII text (e.g. Arabic) verbatim, which would raise
+        `UnicodeEncodeError` or yield a locale-dependent file when the
+        platform's default text encoding is not UTF-8.
+        """
+        with open(file_path, 'w', encoding='utf-8') as fp:
+            json.dump(segments, fp, ensure_ascii=False, indent=2)
+
     def generate_txt(self, segments: list[dict[str, Union[str, float]]]) -> str:
         """Return the stripped text of every segment, one per line, with a trailing newline."""
         stripped_lines = [entry['text'].strip() for entry in segments]
         return '\n'.join(stripped_lines) + '\n'