# setup.yaml

# Template header: CloudFormation format version and a human-readable description.
# The version is quoted so YAML parsers keep it as a literal string instead of
# resolving it to a date value.
AWSTemplateFormatVersion: "2010-09-09"
Description: Voice Crawler Template
# Deploy-time inputs: the two bucket names, the log retention period and the
# language code handed to the transcription Lambda.
Parameters:
  RawAudioBucketName:
    Type: String
    Description: S3 Bucket name for raw audio files
    Default: raw-audio-voice-crawler
  TranscribedAudioBucketName:
    Type: String
    Description: S3 Bucket name for transcribed audio files
    Default: transcribed-audio-voice-crawler
  LogRetentionInDays:
    # Declared as Number and restricted to the retention periods CloudWatch Logs
    # accepts, so a bad value is rejected at parameter validation instead of
    # failing later when the log group is created.
    Type: Number
    Description: Number of log retention days
    Default: 1
    AllowedValues:
      [1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1096,
       1827, 2192, 2557, 2922, 3288, 3653]
  AudioLanguageCode:
    Type: String
    Description: Language code of transcribed audio files
    Default: en-US

Resources:
  # Bucket that receives raw audio uploads. Every object-created event whose key
  # ends in .mp3 is forwarded to the transcription Lambda.
  RawAudioS3Bucket:
    Type: AWS::S3::Bucket
    # S3 validates the notification target while the bucket is being created, so
    # the invoke permission has to exist first. Nothing else orders the two
    # resources, because the permission refers to this bucket through the name
    # parameter rather than through a reference to the bucket resource.
    DependsOn: S3InvokeTranscribeAudioLambdaPermission
    Properties:
      BucketName: !Ref RawAudioBucketName
      NotificationConfiguration:
        LambdaConfigurations:
          - Event: s3:ObjectCreated:*
            Function: !GetAtt TranscribeAudioLambdaFunction.Arn
            Filter:
              S3Key:
                Rules:
                  - Name: suffix
                    Value: .mp3
  # Resource policy statement that lets S3 invoke the transcription Lambda,
  # limited to the raw-audio bucket and to this account.
  S3InvokeTranscribeAudioLambdaPermission:
    Type: AWS::Lambda::Permission
    Properties:
      Action: lambda:InvokeFunction
      FunctionName: !GetAtt TranscribeAudioLambdaFunction.Arn
      Principal: s3.amazonaws.com
      # The ARN is assembled from the bucket-name parameter (not from the bucket
      # resource) and uses the partition pseudo parameter, matching the log
      # group ARNs elsewhere in this template.
      SourceArn: !Sub "arn:${AWS::Partition}:s3:::${RawAudioBucketName}"
      SourceAccount: !Ref "AWS::AccountId"
  # Execution role for the transcription Lambda: read access to the raw-audio
  # bucket, write access to the transcribed-audio bucket and permission to start
  # Transcribe jobs. S3 ARNs use the partition pseudo parameter so they stay
  # consistent with the log group ARNs in this template.
  TranscribeAudioLambdaRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: lambda.amazonaws.com
            Action:
              - sts:AssumeRole
      Path: "/"
      Policies:
        - PolicyName: TranscribeAudioLambdaFunctionPolicies
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # Bucket-level action: resource is the bucket itself.
              - Effect: Allow
                Action:
                  - s3:ListBucket
                Resource: !Sub "arn:${AWS::Partition}:s3:::${RawAudioBucketName}"
              # Object-level actions: resource is every key in the bucket.
              - Effect: Allow
                Action:
                  - s3:GetObject
                Resource: !Sub "arn:${AWS::Partition}:s3:::${RawAudioBucketName}/*"
              - Effect: Allow
                Action:
                  - s3:PutObject
                Resource: !Sub "arn:${AWS::Partition}:s3:::${TranscribedAudioBucketName}/*"
              - Effect: Allow
                Action:
                  - "transcribe:StartTranscriptionJob"
                Resource: "*"
  # Log group for the transcription Lambda, declared explicitly so its retention
  # follows the LogRetentionInDays parameter.
  TranscribeAudioLambdaLogGroup:
    Type: AWS::Logs::LogGroup
    Properties:
      RetentionInDays: !Ref LogRetentionInDays
      # /aws/lambda/<function name>
      LogGroupName: !Join
        - ""
        - - /aws/lambda/
          - !Ref TranscribeAudioLambdaFunction
  # Separate policy attached to the transcription Lambda's role that allows it to
  # write to its own log group. It is a standalone resource because the ARNs
  # contain the function name, and the function in turn references the role.
  TranscribeAudioLambdaLogPermissions:
    Type: AWS::IAM::Policy
    Properties:
      PolicyName: !Sub "${AWS::Region}-TranscribeAudioLambdaLogGroup"
      Roles:
        - !Ref TranscribeAudioLambdaRole
      PolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - logs:CreateLogGroup
              - logs:CreateLogStream
              - logs:PutLogEvents
            # The log group itself, then two wildcard forms below it.
            Resource:
              - Fn::Sub: "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/${TranscribeAudioLambdaFunction}"
              - Fn::Sub: "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/${TranscribeAudioLambdaFunction}:*"
              - Fn::Sub: "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/${TranscribeAudioLambdaFunction}:*:*"
  # Lambda invoked by object-created notifications from the raw-audio bucket.
  # For each notified object it starts an Amazon Transcribe job whose output is
  # directed to the transcribed-audio bucket. The job runs asynchronously; this
  # function only submits it.
  TranscribeAudioLambdaFunction:
    Type: AWS::Lambda::Function
    Properties:
      Runtime: python3.9
      Timeout: 60
      Handler: index.lambda_handler
      Role: !GetAtt TranscribeAudioLambdaRole.Arn
      Environment:
        Variables:
          RAW_AUDIO_BUCKET_NAME: !Ref RawAudioBucketName
          TRANSCRIBED_AUDIO_BUCKET_NAME: !Ref TranscribedAudioBucketName
          AUDIO_LANGUAGE_CODE: !Ref AudioLanguageCode
      Code:
        ZipFile: |
          import os
          import re
          import urllib.parse
          import boto3

          RAW_AUDIO_BUCKET_NAME = os.environ['RAW_AUDIO_BUCKET_NAME']
          TRANSCRIBED_AUDIO_BUCKET_NAME = os.environ['TRANSCRIBED_AUDIO_BUCKET_NAME']
          AUDIO_LANGUAGE_CODE = os.environ['AUDIO_LANGUAGE_CODE']

          transcribe_client = boto3.client('transcribe')

          def job_name_for(key):
              # Job names accept only [0-9a-zA-Z._-] and at most 200 characters,
              # so keys with prefixes ("/") or spaces must be mapped first.
              return re.sub(r'[^0-9a-zA-Z._-]', '_', key)[:200]

          def output_key_for(key):
              # Replace only the trailing extension, never an inner ".mp3",
              # and drop characters the OutputKey field does not accept.
              stem = key[:-len('.mp3')] if key.endswith('.mp3') else key
              return re.sub(r"[^a-zA-Z0-9\-_.!*'()/]", '_', stem) + '.json'

          def start_job(key):
              # s3:// URIs are region independent, unlike the global https endpoint.
              media_uri = 's3://{0}/{1}'.format(RAW_AUDIO_BUCKET_NAME, key)
              print("Audio to transcribe: " + key)
              transcribe_client.start_transcription_job(
                  TranscriptionJobName=job_name_for(key),
                  LanguageCode=AUDIO_LANGUAGE_CODE,
                  MediaFormat='mp3',
                  Media={'MediaFileUri': media_uri},
                  OutputKey=output_key_for(key),
                  OutputBucketName=TRANSCRIBED_AUDIO_BUCKET_NAME
              )
              # The job is asynchronous: at this point it has only been submitted.
              print("Started transcription job, output bucket: " + TRANSCRIBED_AUDIO_BUCKET_NAME)

          def transcribe_audio(event):
              # A notification may carry more than one record; handle all of them.
              for record in event.get('Records', []):
                  # Keys arrive URL-encoded in the S3 event payload.
                  start_job(urllib.parse.unquote_plus(record['s3']['object']['key']))

          def lambda_handler(event, context):
              # Best effort: failures are logged and not re-raised.
              try:
                  transcribe_audio(event)
              except Exception as e:
                  print('Exception when transcribing audio')
                  print(e)
  # Destination bucket for transcription output. Explicitly ordered after the
  # raw-audio bucket.
  TranscribedAudioS3Bucket:
    Type: AWS::S3::Bucket
    DependsOn:
      - RawAudioS3Bucket
    Properties:
      BucketName:
        Ref: TranscribedAudioBucketName
  # Execution role for the bucket-cleaner Lambda: list and delete objects in both
  # buckets, plus basic CloudWatch Logs access so the function's diagnostic
  # output is not lost when a cleanup fails.
  S3BucketCleanerLambdaRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: lambda.amazonaws.com
            Action:
              - sts:AssumeRole
      Path: "/"
      # AWS-managed policy granting log group/stream creation and PutLogEvents.
      # A function-scoped log policy is not possible inside this role because
      # the function itself depends on the role.
      ManagedPolicyArns:
        - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
      Policies:
        - PolicyName: S3BucketCleanerLambdaFunctionPolicies
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # S3 ARNs use the partition pseudo parameter, consistent with the
              # log group ARNs used elsewhere in this template.
              - Effect: Allow
                Action:
                  - s3:ListBucket
                Resource: !Sub "arn:${AWS::Partition}:s3:::${RawAudioBucketName}"
              - Effect: Allow
                Action:
                  - s3:DeleteObject
                Resource: !Sub "arn:${AWS::Partition}:s3:::${RawAudioBucketName}/*"
              - Effect: Allow
                Action:
                  - s3:ListBucket
                Resource: !Sub "arn:${AWS::Partition}:s3:::${TranscribedAudioBucketName}"
              - Effect: Allow
                Action:
                  - s3:DeleteObject
                Resource: !Sub "arn:${AWS::Partition}:s3:::${TranscribedAudioBucketName}/*"
  # Custom resource backed by the bucket-cleaner Lambda; the function's ARN is
  # the service token that receives this resource's lifecycle requests.
  BucketCleanerLambdaInvoke:
    Type: AWS::CloudFormation::CustomResource
    Version: "1.0"
    Properties:
      ServiceToken:
        Fn::GetAtt:
          - S3BucketCleanerLambdaFunction
          - Arn
  # Lambda behind the BucketCleanerLambdaInvoke custom resource. On a Delete
  # request it removes every object from both buckets; any other request type is
  # acknowledged without doing anything. It depends on both buckets so that the
  # cleanup is ordered relative to them.
  S3BucketCleanerLambdaFunction:
    Type: AWS::Lambda::Function
    DependsOn:
      - RawAudioS3Bucket
      - TranscribedAudioS3Bucket
    Properties:
      Runtime: python3.9
      Timeout: 60
      Handler: index.lambda_handler
      Role: !GetAtt S3BucketCleanerLambdaRole.Arn
      Environment:
        Variables:
          # Comma-separated list, split again inside the function.
          BUCKETS_TO_CLEAN: !Sub "${RawAudioBucketName},${TranscribedAudioBucketName}"
      Code:
        ZipFile: |
          import os
          import boto3
          import cfnresponse

          BUCKETS_TO_CLEAN = os.environ['BUCKETS_TO_CLEAN'].split(",")

          s3 = boto3.resource('s3')

          def clear_bucket(bucket):
              s3.Bucket(bucket).objects.all().delete()
              print("Successfully cleared bucket: " + bucket)

          def lambda_handler(event, context):
              responseData = {}
              status = cfnresponse.SUCCESS
              if event.get('RequestType') == 'Delete':
                  for bucket in BUCKETS_TO_CLEAN:
                      # Keep going after a failure so the remaining buckets are
                      # still attempted; the overall result becomes FAILED.
                      try:
                          clear_bucket(bucket)
                      except Exception as e:
                          print('Exception when cleaning bucket: ' + bucket)
                          print(e)
                          status = cfnresponse.FAILED
              # Exactly one response per request, sent only after every bucket
              # has been processed.
              cfnresponse.send(event, context, status, responseData)