11// code obtained from https://github.com/bbc/stt-align-node
22
3- import converterNumbersToWords from 'number-to-words' ;
3+ import { toWords } from 'number-to-words' ;
44import difflib from 'difflib' ;
55import everpolate from 'everpolate' ;
66
@@ -24,12 +24,12 @@ function removeTrailingPunctuation(str) {
2424 * handles edge case if word is undefined, and returns undefined in that instance
2525 */
2626function normaliseWord ( wordText ) {
27- if ( wordText !== undefined ) {
27+ if ( wordText ) {
2828 const wordTextResult = wordText . toLowerCase ( ) . trim ( ) . replace ( / [ ^ a - z | 0 - 9 | . ] + / g, '' ) ;
2929 if ( isANumber ( wordTextResult ) ) {
3030 const sanitizedWord = removeTrailingPunctuation ( wordTextResult ) ;
3131 if ( sanitizedWord !== '' ) {
32- return converterNumbersToWords . toWords ( sanitizedWord ) ;
32+ return toWords ( sanitizedWord ) ;
3333 }
3434 }
3535
@@ -100,30 +100,28 @@ function adjustTimecodesBoundaries(words) {
100100}
101101
102102function interpolate ( wordsList ) {
103- let words = interpolationOptimization ( wordsList ) ;
103+ const words = interpolationOptimization ( wordsList ) ;
104104 const indicies = [ ...Array ( words . length ) . keys ( ) ] ;
105105 const indiciesWithStart = [ ] ;
106106 const indiciesWithEnd = [ ] ;
107107 const startTimes = [ ] ;
108108 const endTimes = [ ] ;
109- // interpolate times for start
110- for ( let i = 0 ; i < words . length ; i ++ ) {
111- if ( 'start' in words [ i ] ) {
112- indiciesWithStart . push ( i ) ;
113- startTimes . push ( words [ i ] . start ) ;
109+
110+ words . forEach ( ( word , index ) => {
111+ if ( 'start' in word ) {
112+ indiciesWithStart . push ( index ) ;
113+ startTimes . push ( word . start ) ;
114114 }
115- }
116- // interpolate times for end
117- for ( let i = 0 ; i < words . length ; i ++ ) {
118- if ( 'end' in words [ i ] ) {
119- indiciesWithEnd . push ( i ) ;
120- endTimes . push ( words [ i ] . end ) ;
115+
116+ if ( 'end' in word ) {
117+ indiciesWithEnd . push ( index ) ;
118+ endTimes . push ( word . end ) ;
121119 }
122- }
120+ } ) ;
123121 // http://borischumichev.github.io/everpolate/#linear
124122 const outStartTimes = everpolate . linear ( indicies , indiciesWithStart , startTimes ) ;
125123 const outEndTimes = everpolate . linear ( indicies , indiciesWithEnd , endTimes ) ;
126- words = words . map ( ( word , index ) => {
124+ const wordsResults = words . map ( ( word , index ) => {
127125 if ( ! ( 'start' in word ) ) {
128126 word . start = outStartTimes [ index ] ;
129127 }
@@ -134,40 +132,31 @@ function interpolate(wordsList) {
134132 return word ;
135133 } ) ;
136134
137- return adjustTimecodesBoundaries ( words ) ;
135+ return adjustTimecodesBoundaries ( wordsResults ) ;
138136}
139137
140138/**
141139 *
142- * @param {array } sttData - array of STT words
140+ * @param {array } sttWords - array of STT words
143141 * @param {array } transcriptWords - array of base text accurate words
144142 */
145143function alignWords ( sttWords , transcriptWords ) {
146- // console.log(sttWords);
147- // # extract list of words
148- // sttWords=[words.get('word') for words in sttData]
149-
150144 // # normalise words: lowercase, strip special characters, spell out numeric tokens as words (see normaliseWord)
151- // sttWordsStripped = [re.sub('[^a-z]', '', word.lower()) for word in sttWords]
152145 const sttWordsStripped = sttWords . map ( ( word ) => {
153146 return normaliseWord ( word . word ) ;
154147 } ) ;
155148
156- // transcriptWordsStripped = [re.sub('[^a-z]', '', word.lower()) for word in transcriptWords]
157149 const transcriptWordsStripped = transcriptWords . map ( ( word ) => {
158150 return normaliseWord ( word ) ;
159151 } ) ;
160152 // # create empty list to receive data
161- // transcriptData = [{} for _ in range(len(transcriptWords))]
162153 const transcriptData = [ ] ;
163154 // empty objects as place holder
164155 transcriptWords . forEach ( ( ) => {
165156 transcriptData . push ( { } ) ;
166157 } ) ;
167158 // # populate transcriptData with matching words
168- // matcher = difflib.SequenceMatcher(None, sttWordsStripped, transcriptWordsStripped)
169159 // open question: if both word lists are the same length, could we just interpolate the words?
170- // const matcher = diffWordMode(transcriptWordsStripped, sttWordsStripped);
171160 // http://qiao.github.io/difflib.js/
172161 const matcher = new difflib . SequenceMatcher ( null , sttWordsStripped , transcriptWordsStripped ) ;
173162 const opCodes = matcher . getOpcodes ( ) ;
0 commit comments