ReportSnapshot.py
import os
import sys

import awswrangler as wr
import boto3
from pandas import DataFrame, concat
from datetime import datetime as dt
from dotenv import load_dotenv

# Make the sibling "importer" package importable so MongoConnection resolves.
f_path = __file__
index = f_path.find("report_generation/")
f_path = f_path[:index]
sys.path.insert(1, f_path + 'importer')
from MongoConnection import mongo_connection

load_dotenv()


ACCESS_ID, ACCESS_KEY = os.environ.get("AWS_KEY_ID"), os.environ.get("AWS_KEY_SECRET")

mongo_conn = mongo_connection()
snap_collection = mongo_conn.mongo_conn_snapshots()

# Module-level accumulators filled while streaming the report CSV in chunks.
final_df = DataFrame()
total_rows = 0
total_local = 0

session = boto3.Session(aws_access_key_id=ACCESS_ID, aws_secret_access_key=ACCESS_KEY)

# Timestamp truncated to midnight so every snapshot from one run shares it.
update_at = dt.now().replace(hour=0, minute=0, second=0, microsecond=0)

def read_report_csv(filename, chunk):
    """Receives an S3 path and loads the data with awswrangler in chunks (1M lines per chunk).

    Args:
        filename (str): s3 file location
        chunk (int): chunk size
    Returns: final_df (pandas dataframe)
    """
    global final_df, total_rows, total_local
    _dfs = wr.s3.read_csv(filename, low_memory=False, chunksize=chunk, keep_default_na=False, boto3_session=session)
    for df in _dfs:
        total_rows += df.shape[0]
        total_local += float(df["total_local"].sum())
        # Keep only the 10,000 highest-earning rows of each chunk to bound memory.
        df.sort_values("total_local", ascending=False, inplace=True)
        df = df.iloc[:10000, :]
        final_df = concat([final_df, df], ignore_index=True)
    return final_df

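# Usage sketch (bucket and key are hypothetical placeholders):
#   df = read_report_csv("s3://example-bucket/reports/2023-01/sales.csv", 1_000_000)
#   print(total_rows, total_local)  # module-level accumulators updated as a side effect
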
def group_by_field(data, fields, snapshot):
    """Groups the data by each requested field and accumulates totals in the snapshot.

    Args:
        data (pandas dataframe): current report file loaded as dataframe
        fields (list): list of fields that are considered for the snapshot
        snapshot (dict): it will contain all fields' info
    Returns: snapshot (dict)
    """
    # Group by release_id, service_id, territory_code, artists and track_title;
    # build_snapshot mutates `snapshot` in place, so one dict accumulates everything.
    for field in fields:
        df_grouped = data.groupby(field)["total_local"].sum()
        snapshot = build_snapshot(df_grouped, field, snapshot)
    return snapshot

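# The snapshot dict is expected to arrive pre-seeded with one list per facet,
# a shape inferred from build_snapshot below:
#   snapshot = {"byUPC": [], "byArtist": [], "byDSP": [], "byTerritory": [], "byTrack": []}
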
def build_snapshot(df, field, snapshot):
    """Checks the field and fills the matching snapshot entry using the grouped data.

    Args:
        df (pandas series): totals grouped by the current field
        field (str): current field used as filter
        snapshot (dict): it will contain all fields' info
    Returns: snapshot (dict)
    """
    df.sort_values(ascending=False, inplace=True)
    # Keep only the 50 largest totals per field.
    for i in df.index[:50]:
        if field == "release_id":
            element = {"release_id": i, "total": float(df.loc[i])}
            snapshot["byUPC"].append(element)
        elif field == "artists":
            element = {"artist": i, "total": float(df.loc[i])}
            snapshot["byArtist"].append(element)
        elif field == "service_id":
            element = {"service": i, "total": float(df.loc[i])}
            snapshot["byDSP"].append(element)
        elif field == "territory_code":
            element = {"territory_code": i, "total": float(df.loc[i])}
            snapshot["byTerritory"].append(element)
        elif field == "track_title":
            element = {"track_title": i, "total": float(df.loc[i])}
            snapshot["byTrack"].append(element)
    return snapshot

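# Illustrative entry appended for field == "territory_code" (values are made up):
#   {"territory_code": "US", "total": 1234.56}
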
def upload_mongo(event, final_path, snapshot, period):
    """Creates or updates the snapshot document in MongoDB.

    Args:
        event (dict): contains the file and client info
        final_path (str): path where the original csv is saved in s3
        snapshot (dict): it contains all fields' info
        period (str): month and year when the sales were processed
    Returns: Nothing
    """
    name = final_path.split("/")[-1]
    file_id = "{}_report_{}".format(event["client_id"], event["date"])
    document = {
        "file_db_id": file_id,
        "status": "finished",
        "client_id": event["client_id"],
        "name": name,
        "s3_path": final_path,
        "reporting_period": period,
        "total_local": total_local,
        "snapshot": snapshot,
        "updated_at": update_at,
        "total_rows": total_rows
    }
    # upsert=True replaces an existing snapshot or inserts a new one in a single call,
    # avoiding the race of a separate replace_one/insert_one pair.
    snap_collection.replace_one({"file_db_id": file_id}, document, upsert=True)
    print(f"Generating Snapshot with id: {file_id}")

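# Documents are keyed as "<client_id>_report_<date>", e.g. a hypothetical
# "client42_report_2023-01-15".
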
def search_snapshots_id(files, date):
    """Searches MongoDB for the ids of all snapshots matching the given files and date."""
    date = dt.strptime(date, "%Y-%m-%d")
    ids = snap_collection.find({"name": {"$in": files}, "updated_at": date}, {"_id": 1, "file_db_id": 1, "name": 1})
    # A pymongo cursor is always truthy, so materialize it before checking for results.
    snapshots = [{"_id": i["_id"], "file_db_id": i["file_db_id"], "name": i["name"]} for i in ids]
    if snapshots:
        return snapshots
    raise ValueError("No snapshots found")


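# Usage sketch (filenames and date are hypothetical):
#   snaps = search_snapshots_id(["report_a.csv", "report_b.csv"], "2023-01-15")
#   # -> [{"_id": ObjectId("..."), "file_db_id": "...", "name": "report_a.csv"}, ...]
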
def create_snapshot(event, final_path):
    """Executes the full procedure.

    Args:
        event (dict): contains the file and client info
        final_path (str): path where the original csv is saved in s3
    Returns: Nothing
    """
    s3_path = "s3://{}/{}".format(event["bucket"], final_path)
    df = read_report_csv(s3_path, 1_000_000)
    # Alternative (currently disabled): rebuild the per-field snapshot from the data.
    # fields = ["release_id", "service_id", "territory_code", "artists", "track_title"]
    # snapshot = {"byUPC": [], "byArtist": [], "byDSP": [], "byTerritory": [], "byTrack": []}
    # snapshot = group_by_field(df, fields, snapshot)
    snapshot = search_snapshots_id(event["files"], event["date"])
    # Use the most frequent value of the date column as the reporting period.
    period = df["date"].value_counts().index[0]

    # Normalize to YYYYMM whether the date is YYYY-MM or YYYY-MM-DD.
    if len(period) > 7:
        period = period[:7].replace("-", "")
    else:
        period = period.replace("-", "")

    print("The reporting period is:", period)
    upload_mongo(event, final_path, snapshot, period)
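

# Minimal driver sketch, assuming an event shape inferred from the functions above;
# the bucket, client id, date and filenames are placeholders, not real values.
if __name__ == "__main__":
    event = {
        "bucket": "example-bucket",        # hypothetical S3 bucket
        "client_id": "client42",           # hypothetical client id
        "date": "2023-01-15",              # used in file_db_id and the snapshot lookup
        "files": ["report_a.csv"],         # filenames to match in MongoDB
    }
    create_snapshot(event, "reports/2023-01/report_a.csv")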