Glider
CreateSnapshots.py
import os
import ssl
import boto3
import sys
import traceback
import awswrangler as wr
import pandas as pd
from datetime import datetime as dt
from dotenv import load_dotenv
from pathlib import Path
# from pymongo import MongoClient
from ErrorHandler import *
# from TemplateManager import TemplateManager
from MongoConnection import mongo_connection
# from CreateSnapshotsFonal import snapshot_playground

# dotenv_path = Path("src/importer/.env")
load_dotenv()


mongo_conn = mongo_connection()

collection = mongo_conn.mongo_conn_snapshots()


ACCESS_ID, ACCESS_KEY = os.environ.get("AWS_KEY_ID"), os.environ.get("AWS_KEY_SECRET")


s3_session = boto3.Session(aws_access_key_id=ACCESS_ID, aws_secret_access_key=ACCESS_KEY)

def get_data(file, bucket_out, path_out):
    """Receives an S3 path and loads the data using awswrangler.

    Args:
        file (str): current filename
        bucket_out (str): bucket where the parquet file is stored
        path_out (str): S3 prefix where the parquet file is stored
    Returns: df (pandas dataframe)
    """
    s3_filename = "s3://{bucket}/{path}/{file}".format(bucket=bucket_out, path=path_out, file=file)
    df = wr.s3.read_parquet(s3_filename, boto3_session=s3_session)
    return df

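# A minimal usage sketch with hypothetical names (bucket, prefix and file are
# placeholders, not real resources): for file="sales_202401.parquet",
# bucket_out="example-results-bucket" and path_out="client-1/results" the
# function reads
# s3://example-results-bucket/client-1/results/sales_202401.parquet:
#
#   df = get_data("sales_202401.parquet", "example-results-bucket", "client-1/results")
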
def group_by_field(data, fields, snapshot):
    """Group by service_id, territory_code, artists and tracks.

    Args:
        data (pandas dataframe): current parquet file loaded as dataframe
        fields (list): list of fields that are considered for the snapshot
        snapshot (dict): will contain the info for all fields
    Returns: snapshot (dict)
    """
    snap_fields = {"service_id": "byDSP", "territory_code": "byTerritory", "artists": "byArtist", "track_title": "byTrack", "release_id": "byRelease"}
    # snap_fields = {"artists": "byArtist", "track_title": "byTrack", "release_id": "byRelease"}
    for field in fields:
        snap_field = snap_fields[field]
        if field == "track_title":
            grouped_df = data.groupby('track_title').agg({
                'total_local': 'sum',
                'isrc_id': 'first',
                'artists': 'first'
            }).reset_index()
            # grouped_df = data.groupby('track_title').agg({
            #     'total_local': 'sum',
            #     'isrc_id': 'first',
            #     'artists': 'first',
            #     'release_id': lambda x: pd.Series(x).unique().tolist(),
            #     'territory_code': lambda x: pd.Series(x).unique().tolist(),
            #     'service_id': lambda x: pd.Series(x).unique().tolist()}).reset_index()
        elif field == "artists":
            grouped_df = data.groupby('artists').agg({
                'total_local': 'sum',
                # 'release_id': 'first'
                'release_id': lambda x: pd.Series(x).unique().tolist()
            }).reset_index()
        else:
            grouped_df = data.groupby(field).agg({
                'total_local': 'sum',
            }).reset_index()
        grouped_df.rename(columns={field: 'key', 'total_local': 'total'}, inplace=True)
        snapshot[snap_field] = grouped_df.to_dict(orient='records')
        # df_grouped = data.groupby(field)["total_local"].sum()
        # snap = build_snapshot(df_grouped, field, snapshot)
    return snapshot

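# Sketch of the output shape on toy data (values are illustrative only):
# grouping a dataframe by "service_id" produces records keyed as
# {"key": ..., "total": ...} under the matching snapshot entry, e.g.
#
#   data = pd.DataFrame({"service_id": ["spotify", "spotify", "apple"],
#                        "total_local": [1.0, 2.0, 3.0]})
#   group_by_field(data, ["service_id"], {"byDSP": []})
#   # -> {"byDSP": [{"key": "apple", "total": 3.0},
#   #               {"key": "spotify", "total": 3.0}]}
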
def upload_mongo(results, file_db_id, update, length):
    """Update the snapshot document of a file in MongoDB.

    Args:
        results (dict): snapshot results, one entry per snapshot field
        file_db_id (str): database id of the file whose document is updated
        update (datetime): the datetime when the snapshot was updated
        length (int): total rows
    Returns: Nothing
    """
    for field in results:
        collection.update_one({"file_db_id": file_db_id}, {"$set": {field: results[field]}})
    collection.update_one({"file_db_id": file_db_id}, {"$set": {"status": "ingested", "status_cause": "No Errors", "error_message": "No message", "updated_at": update, "total_rows": length}})

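# After upload_mongo runs, the matched document carries one field per key in
# results (procedure_status, reporting_period, snapshot, ...) plus the
# bookkeeping fields from the final update_one call, roughly:
#
#   {"file_db_id": "...", "snapshot": {...}, "status": "ingested",
#    "status_cause": "No Errors", "error_message": "No message",
#    "updated_at": <datetime>, "total_rows": <int>}
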
def results_grouped(event, context):
    """Executes the full snapshot procedure.

    Args:
        event (dict): dictionary with all client and sales information
        context (None): required only for Lambda execution
    Returns: (dict)
    """
    if event["status"] != "OK":
        return {"status": "No snapshots generated due to: " + str(event["status"])}
    snapshot_created = dict()
    snapshot_not_created = dict()
    status = "Snapshots Created"
    # playground_formats = [event["format"][i] for i in event["format"] if "playground" in i]
    # if playground_formats:
    #     snapshot_playground(event, playground_formats)
    for formats in event["format"]:
        # if "playground" in formats:
        #     continue
        files = event["format"][formats]["files"]
        if len(files) == 0:
            continue
        for file in files:
            try:
                file_db_id = file["file_id"]
                filename = file["file"]
                filename = filename + ".parquet"
                fields = ["service_id", "territory_code", "artists", "track_title", "release_id"]
                # fields = ["artists", "track_title", "release_id"]
                df = get_data(filename, event["results_bucket"], event["results_path"])
                total_rows = df.shape
                snapshot = dict({"byArtist": [], "byDSP": [], "byTerritory": [], "byTrack": [], "byRelease": []})
                # snapshot = dict({"byArtist": [], "byTrack": [], "byRelease": []})
                snapshot = group_by_field(df, fields, snapshot)
                period = str(df.loc[0, "period"])

                if len(period) > 7:
                    period = period[:7].replace("-", "")
                # elif len(period) == 7:
                else:
                    period = period.replace("-", "")
                try:
                    results = {
                        "procedure_status": "OK",
                        "reporting_period": period,  # .getType(filename, formats)
                        "total_local": float(df["total_local"].sum()),
                        "total_gross": float(df["total_gross"].sum()),
                        "total_net": float(df["total_net"].sum()),
                        "snapshot": snapshot
                    }
                except KeyError:
                    # total_gross/total_net columns are not present for every format
                    results = {
                        "procedure_status": "OK",
                        "reporting_period": period,  # .getType(filename, formats)
                        "total_local": float(df["total_local"].sum()),
                        "snapshot": snapshot
                    }
                print("{}: {} Generating Snapshot".format(file_db_id, filename))
                update_at = dt.now().replace(hour=0, minute=0, second=0, microsecond=0)
                if "playground_digital" in formats:
                    root_db_id = file_db_id.split("-")[:-1]
                    root_db_id = "-".join(root_db_id)
                    upload_mongo(results, root_db_id, update_at, total_rows[0])
                upload_mongo(results, file_db_id, update_at, total_rows[0])
                snapshot_created[file_db_id] = filename
            except Exception as e:
                file_db_id = file["file_id"]
                # print(sys.exc_info()[2])
                # print(traceback.format_exc())
                print(e)
                m = "{}\n{}".format(sys.exc_info()[2], traceback.format_exc())
                eh = ErrorHandler()
                error = eh.handle(e, m, file_db_id)
                status = "Some snapshots were not created"
                snapshot_not_created[file_db_id] = {"file": file["file"], "error": error}
                continue
                # return {"status": error}

    return {"procedure_status": status,
            "results_bucket": event["results_bucket"],
            "results_path": event["results_path"],
            "client_id": event["client_id"],
            "snapshots_created": snapshot_created,
            "no_snapshots": snapshot_not_created}