
Commit c68b420

modified README.md and added etl pipeline code

1 parent 5c39335 commit c68b420

File tree

4 files changed: +178 -1 lines changed

README.md (+12 -1)
@@ -1 +1,12 @@
-# aws_streaming_pipeline
+## AWS Streaming Pipeline for real-time Cryptocurrency Price analysis
+
+This AWS streaming pipeline ingests real-time cryptocurrency price data from the CoinGecko API, transforms it for analysis, and stores it in a ready-to-use format. The pipeline leverages the following services:
+- **AWS Kinesis and Firehose**: Continuously stream data from the CoinGecko API in real time. I also implemented data partitioning within Firehose to improve performance.
+- **Amazon S3**: Serves as the data lake for storing the raw cryptocurrency price data.
+- **AWS Glue**: Provides a job to transform the raw data into a schema optimized for analytics.
+- **AWS Lambda**: Acts as an event-driven trigger, initiating the Glue job whenever new data arrives in the S3 raw layer.
+- **Amazon SNS**: Publishes notifications (to an SNS topic) about new data arrivals in S3, which are then picked up by the Lambda function.
+
+This architecture ensures that the pipeline automatically processes incoming data, keeping the analytical layer up to date with the latest information.
+
+![Pipeline Architecture](images/architecture.png)
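
The Lambda function that connects SNS to Glue is not part of this commit. As a point of reference only, here is a minimal sketch of what such an SNS-triggered handler might look like; the handler name, the `GLUE_JOB_NAME` environment variable, and the event parsing are assumptions, while `glue.start_job_run` is the standard boto3 call for starting a Glue job.

# Hypothetical SNS-triggered Lambda handler; names and environment
# variables are assumptions, not part of this commit.
import os
import json
import boto3

glue = boto3.client("glue")

def lambda_handler(event, context):
    for record in event["Records"]:
        # The S3 event notification arrives JSON-encoded inside the SNS message.
        s3_event = json.loads(record["Sns"]["Message"])
        for s3_record in s3_event.get("Records", []):
            bucket = s3_record["s3"]["bucket"]["name"]
            key = s3_record["s3"]["object"]["key"]
            print(f"New object s3://{bucket}/{key}, starting Glue job")
            glue.start_job_run(JobName=os.environ["GLUE_JOB_NAME"])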
(new file, +94)
import os
import json
import uuid
import asyncio

import boto3
from aiohttp import ClientSession
from dotenv import load_dotenv


def extract_subset(actual_json_dictionary, selected_columns):
    # Keep only the selected columns from each record.
    extract_json_dict = []
    for entry in actual_json_dictionary:
        # Build a fresh dict per entry so records do not overwrite each other.
        extracted_json = {column: entry[column] for column in selected_columns}
        extract_json_dict.append(extracted_json)
    return extract_json_dict


async def fetch_market_cap_data(currency, api_key, queue):
    try:  # wrapped in try/except so the task can be stopped cleanly via task.cancel()
        while True:
            print("Fetching data...")
            url = (
                "https://api.coingecko.com/api/v3/coins/markets"
                f"?vs_currency={currency}&x_cg_demo_api_key={api_key}"
            )
            async with ClientSession() as session:
                async with session.get(url) as response:
                    if response.status == 200:
                        # Success: put the data in the queue.
                        market_cap_data = await response.json()
                        print("Adding item to the queue...")
                        await queue.put(market_cap_data)
                        print("Item added to the queue.")
                    else:
                        # aiohttp exposes the HTTP status as response.status
                        print("API request failed with status code:", response.status)

            # 70 seconds between calls: the cache/update frequency of the public API
            await asyncio.sleep(70)
    except asyncio.CancelledError:
        print("Task canceled. Exiting fetch_market_cap_data.")


async def send_batch_to_kinesis(stream_name, queue, kinesis_streams):
    while True:
        try:
            batch_data = await asyncio.wait_for(queue.get(), timeout=5)
        except asyncio.TimeoutError:
            if queue.empty():
                print("Queue is empty, stopping task.")
                break
        else:
            try:
                for data in batch_data:
                    encoded_data = json.dumps(data).encode('utf-8')
                    kinesis_streams.send_stream(stream_name, encoded_data, None)
            except Exception as e:
                print(f"Error while sending to Kinesis: {e}")


class KinesisStream:
    def __init__(self, region_name='eu-west-3'):
        self.kinesis_client = boto3.client('kinesis', region_name=region_name)

    def send_stream(self, stream_name, data, partition_key=None):
        if partition_key is None:
            # A random partition key spreads records evenly across shards.
            partition_key = str(uuid.uuid4())

        try:
            self.kinesis_client.put_record(
                StreamName=stream_name,
                Data=data,
                PartitionKey=partition_key
            )
        except self.kinesis_client.exceptions.ResourceNotFoundException:
            print(f"Kinesis stream '{stream_name}' not found")


async def main():
    load_dotenv()
    api_key = os.environ['api_key']
    queue = asyncio.Queue()
    stream_name = "api-to-kinesis-streams-coingecko"
    kinesis_streams = KinesisStream()

    producer_task = asyncio.create_task(fetch_market_cap_data("usd", api_key, queue))

    try:
        # Give the producer one fetch cycle; wait_for cancels the task and
        # raises TimeoutError once the timeout elapses.
        await asyncio.wait_for(producer_task, timeout=71)
    except asyncio.TimeoutError:
        pass

    sending_task = asyncio.create_task(send_batch_to_kinesis(stream_name, queue, kinesis_streams))
    await sending_task


if __name__ == "__main__":
    asyncio.run(main())
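
A note on the flow in `main()`: the producer gets one 70-second fetch cycle before `asyncio.wait_for` cancels it, and only then does the consumer drain the queue into Kinesis, so each run performs one fetch-then-ship pass. Also, `extract_subset` is defined but never called; it appears intended for trimming each API record down to a few columns before sending. A quick illustration with a made-up record shaped like the `/coins/markets` response:

# Hypothetical sample record; field values are made up for illustration.
sample = [
    {"id": "bitcoin", "symbol": "btc", "current_price": 64250.0, "market_cap": 1265000000000},
]
print(extract_subset(sample, ["id", "current_price"]))
# [{'id': 'bitcoin', 'current_price': 64250.0}]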

coingecko_etl_pipeline/glue_job.py (new file, +72)
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.context import SparkContext
from pyspark.sql.functions import expr

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)


def create_df_from_s3():
    # Read the raw JSON data from S3, as registered in the Glue Data Catalog.
    dyf = glueContext.create_dynamic_frame.from_catalog(database='coingecko_database', table_name='data')
    return dyf.toDF()


def remove_duplicate(df):
    # Remove duplicate records based on the coin id and its last update time.
    return df.dropDuplicates(["id", "last_updated"])


def drop_columns(df):
    # Drop struct-typed columns that are not needed for analytics.
    cols = ("roi", "image", "ath", "ath_change_percentage", "ath_date",
            "atl", "atl_change_percentage", "atl_date")
    return df.drop(*cols)


def clean_structure(df):
    # Numeric fields arrive as Glue "choice" structs (e.g. {int, long});
    # coalesce the variants of each column into a single numeric value.
    df = df.withColumn("market_cap", expr("coalesce(market_cap.int, market_cap.long)")) \
        .withColumn("current_price", expr("coalesce(current_price.double, current_price.int)")) \
        .withColumn("fully_diluted_valuation", expr("coalesce(fully_diluted_valuation.int, fully_diluted_valuation.long)")) \
        .withColumn("total_volume", expr("coalesce(total_volume.int, total_volume.long)")) \
        .withColumn("high_24h", expr("coalesce(high_24h.double, high_24h.int)")) \
        .withColumn("low_24h", expr("coalesce(low_24h.double, low_24h.int)")) \
        .withColumn("market_cap_change_24h", expr("coalesce(market_cap_change_24h.double, market_cap_change_24h.int, market_cap_change_24h.long)"))
    return df


if __name__ == "__main__":
    # Initialise the job so job.commit() can record bookmark state.
    args = getResolvedOptions(sys.argv, ['JOB_NAME'])
    job.init(args['JOB_NAME'], args)

    df = create_df_from_s3()
    df_deduplicate = remove_duplicate(df)
    df_w_drop = drop_columns(df_deduplicate)
    df_final = clean_structure(df_w_drop)

    # Convert the Spark DataFrame back to a Glue DynamicFrame for writing.
    glue_dynamic_frame = DynamicFrame.fromDF(df_final, glueContext, "glue_etl")

    s3output = glueContext.getSink(
        path="s3://coingecko-clean-datalake/clean_coins_data/",
        connection_type="s3",
        updateBehavior="UPDATE_IN_DATABASE",
        partitionKeys=[],
        compression="snappy",
        enableUpdateCatalog=True,
        transformation_ctx="s3output",
    )

    s3output.setCatalogInfo(
        catalogDatabase="coingecko_database", catalogTableName="clean_marketcap_data"
    )

    s3output.setFormat("glueparquet")
    s3output.writeFrame(glue_dynamic_frame)
    job.commit()
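
The `coalesce` expressions in `clean_structure` flatten the choice structs that the Glue crawler creates when a JSON field mixes numeric types (for example, int in some records and long in others). A DynamicFrame-native alternative, shown here only as a sketch and not what this commit uses, would be to resolve the choices with `resolveChoice` before converting to a DataFrame:

# Sketch only: cast mixed numeric choice columns on the DynamicFrame itself
# instead of coalescing struct fields in Spark SQL. Not part of this commit.
dyf_resolved = dyf.resolveChoice(specs=[
    ("market_cap", "cast:long"),
    ("current_price", "cast:double"),
    ("total_volume", "cast:long"),
])
df = dyf_resolved.toDF()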

images/architecture.png (394 KB)