Glider
ProcessFiles.py
# import io
import os
import boto3
import sys
import traceback
import awswrangler as wr
from datetime import date
from multiprocessing import Process, Manager
from ColumnsManager import BuildOttoData
from ErrorHandler import *
from Validate import validate_data
from FormatManager import csvManager
from dotenv import load_dotenv
from UploadSales import upload_main
from MongoConnection import mongo_connection

load_dotenv()

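# Formats treated as Merlin stores/DSPs; at the moment this list is only
# referenced by the commented-out discount step inside create_new_df().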
merlin_formats = ["akazoo", "alibaba", "anghami", "AWA", "awa", "boomplay", "deezer", "iheart", "jiosaavn", "kkbox", "mixcloud", "netease", "pandora", "slacker", "soundcloud",
                  "soundtrack_your_brand", "spotify", "tencent", "tiktok", "uma", "yandex", "facebook", "roxi", "triller", "resso", "peloton", "snapchat",
                  "jaxsta", "trebel", "youtube_merlin", "vevo", "youtube_shorts", "youtube_merlin_label", "audiblemagic",
                  "facebook_revshare", "joox", "saavn", "tiktok-miniplayer", "kkbox_v2", "soundcloud_v2", "youtube_tier"]  # "spotify_discovery",


ACCESS_ID, ACCESS_KEY = os.environ.get("AWS_KEY_ID"), os.environ.get("AWS_KEY_SECRET")


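# MongoDB connections: the snapshots collection tracks per-file processing status,
# and the sales collection receives the uploaded sales lines.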
mongo_conn = mongo_connection()

snap_collection = mongo_conn.mongo_conn_snapshots()

collection_name = os.environ.get("COLLECTION")

collection = mongo_conn.mongo_conn_sales()


s3_session = boto3.Session(aws_access_key_id=ACCESS_ID, aws_secret_access_key=ACCESS_KEY)

def save_parquet_s3(df, event, name):
    """Save the current dataframe as a parquet file in S3

    Args:
        df (pandas dataframe): processed file loaded as dataframe
        event (dict): dictionary with all client and sales information
        name (str): current filename
    Returns: response (str), df (pandas dataframe)
    """
    df["quantity"] = df["quantity"].fillna(0)
    df["total_local"] = df["total_local"].fillna(0.0)
    df["full_total_foreign"] = df["full_total_foreign"].fillna(0.0)
    df["quantity"] = df["quantity"].astype("int")
    df["total_local"] = df["total_local"].astype("float")
    df["full_total_foreign"] = df["full_total_foreign"].astype("float")
    df["catalogue_id"] = "no_catalogue_id"
    df.fillna("undefined", inplace=True)
    df.reset_index(drop=True, inplace=True)
    bucket_out = event["bucket"][1]
    path_out = event["path"][1]
    parquet_file = name + '.parquet'
    wr.s3.to_parquet(
        df=df,
        path="s3://{bucket}/{path}/{file}".format(bucket=bucket_out, path=path_out, file=parquet_file),
        boto3_session=s3_session,
        index=False
    )
    response = "{} uploaded to {}/{}".format(parquet_file, bucket_out, path_out)
    return response, df

def upload_data_mongo(df, filename):
    """Upload batched sales lines to MongoDB

    Args:
        df (dataframe): Dataframe with sales matched by catalogue
        filename (str): current filename; existing lines for this file are deleted first
    Returns: Nothing
    """
    # files_delete = [i["file"] for i in files]
    documents = collection.find_one({"file": filename})
    if documents:
        r = collection.delete_many({"file": filename})
        response = f"{r.deleted_count} sales lines deleted"
    else:
        response = "No files deleted"
    print(response)
    # lines = df.apply(lambda x: x.to_dict(), axis=1).to_list()
    lines = df.to_dict(orient="records")
    try:
        print(f"{'*'*15} Uploading Data to Mongo {'*'*15}")
        chunk_size = 500000  # Number of documents per batch
        for start in range(0, len(lines), chunk_size):
            end = start + chunk_size
            chunk = lines[start:end]
            collection.insert_many(chunk)
            print(f"Batch: {end} finished.")
        # collection.insert_many(lines)
        print(f"Lines uploaded to {collection_name}")
    except Exception as e:
        raise Exception("There are no lines to upload") from e

def fix_date(date):
    """Normalize date column to YYYY-MM-DD format

    Args:
        date (datetime stamp): date column
    Returns: date (str)
    """
    date = str(date)
    date = date.replace(" 00:00:00", "")
    return date
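    # Example (illustrative): fix_date("2024-03-15 00:00:00") -> "2024-03-15"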

def get_period(date, filename):
    """Generates the period (YYYYMM) for a given date

    Args:
        date (str): date column
        filename (str): current filename; files matching the Fuga template take the period from the name
    Returns: period (str)
    """
    if "KepachMusictas" in filename:
        from templates import FugaTemplate
        period = FugaTemplate().date(filename)
    else:
        date = str(date)
        if len(date) > 7:
            period = date[:7].replace("-", "")
        else:
            period = date.replace("-", "")
    return period
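    # Examples (illustrative): get_period("2024-03-15", "sales.csv") -> "202403";
    # get_period("2024-03", "sales.csv") -> "202403"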

def create_new_df(event, formats, return_dict):
    """Executes full procedure per format

    Args:
        event (dict): dictionary with all client and sales information
        formats (str): current format to process
        return_dict (dict): dictionary with all processed files information

    Returns: return_dict (dict)
    """
    # write_DF = get_DF_otto()
    no_processed = dict()
    final_files = event["format"][formats]["files"][:]
    for file in event["format"][formats]["files"]:
        try:
            print(file)
            data_complete, status_data = csvManager.df_csv(event, formats, file, s3_session, BuildOttoData())
            print(status_data)
            if status_data != "Ok":
                raise FileNotLoaded()
            # Data validation
            vd = validate_data()
            data_complete, status_valid = vd.validation(data_complete)
            if status_valid != "Ok":
                raise ValidationFailed()
            # Add accounting date column
            if "currencies not found" in BuildOttoData().add_exchange(data_complete, event["currency"], event["currency_exception"]):
                raise CurrencyNotFound()

            data_complete = BuildOttoData().add_info_columns(data_complete, file["file"], event["client_id"], event["base_currency"])

            # print("{FILE} file total previous: {TOTAL}".format(FILE=file["file"], TOTAL=data_complete["total_local"].sum()))
            # if formats in merlin_formats:
            #     print("Making discount for {}".format(file["file"]))
            #     write_DF.discount(data_complete)

            # Add period and date columns
            data_complete["period"] = data_complete.apply(lambda row: get_period(row["date"], file["file"]), axis=1)
            data_complete["date"] = data_complete.apply(lambda row: fix_date(row["date"]), axis=1)
            print(data_complete.head())
            p_response, data_complete = save_parquet_s3(data_complete, event, file["file"])
            print(p_response)
            snap_collection.update_one({"file_db_id": file["file_id"]}, {"$set": {"status": "processed"}})
            upload_data_mongo(data_complete, file["file"])
        except Exception as e:
            print(sys.exc_info()[2])
            print(traceback.format_exc())
            m = "{}\n{}".format(sys.exc_info()[2], traceback.format_exc())
            file_db_id = file["file_id"]
            eh = ErrorHandler()
            error = eh.handle(e, m, file_db_id)
            no_processed[file_db_id] = {"file": file["file"], "error": error}
            final_files.remove(file)
            continue

    event["format"][formats].pop("columns")
    return_dict["status"] = "OK"
    return_dict["tag"] = event["tag"]
    return_dict["cat_gen"] = event["cat_gen"]
    return_dict["cat_match"] = event["cat_match"]
    return_dict["results_bucket"] = event["bucket"][1]
    return_dict["results_path"] = event["path"][1]
    return_dict["client_id"] = event["client_id"]
    return_dict["format_{}".format(formats)] = final_files
    return_dict["no_processed_{}".format(formats)] = no_processed

    return return_dict

def process_files_parallel(event, context=None):
    """Executes full procedure using multiprocessing to process several formats at the same time

    Args:
        event (dict): dictionary with all client and sales information
        context (None): required only for Lambda execution
    Returns: final_output (dict)
    """
    if len(event["format"]) == 0:
        return {"status": "No Files to Process"}
    manager = Manager()
    return_dict = manager.dict()
    jobs = []
    for formats in event["format"]:
        p = Process(target=create_new_df, args=(event, formats, return_dict))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()
    return_dict = dict(return_dict)
    final_output = dict()
    final_output["format"] = dict()
    for i in return_dict:
        if "format_" in i:
            name = i.split("format_")[1]
            final_output["format"][name] = {"files": return_dict[i]}
        else:
            final_output[i] = return_dict[i]
    final_output["no_format_identified"] = event["no_format_identified"]
    # try:
    #     # Matches and uploads sales to MongoDB
    #     upload_main(final_output, s3_session)
    # except:
    #     print("Catalogue Matching has failed")
    #     print(sys.exc_info()[2])
    #     print(traceback.format_exc())
    #     pass
    return final_output

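# --- Illustrative usage (a sketch, not part of the production entry point) ---
# The shape of "event" below is inferred from the keys read in this module
# (format/files/columns, bucket, path, currency, client_id, ...); all values
# are placeholders, and running this requires real S3/MongoDB access plus an
# upstream payload with the actual column mappings.
if __name__ == "__main__":
    example_event = {
        "format": {
            "spotify": {
                "files": [{"file": "spotify_202403.csv", "file_id": "abc123"}],  # hypothetical file
                "columns": {},  # normally filled by the upstream format detection
            }
        },
        "bucket": ["input-bucket", "output-bucket"],   # [source, results]
        "path": ["incoming", "processed"],             # [source prefix, results prefix]
        "currency": {},
        "currency_exception": {},
        "client_id": "client_001",
        "base_currency": "USD",
        "tag": "202403",
        "cat_gen": False,
        "cat_match": False,
        "no_format_identified": [],
    }
    print(process_files_parallel(example_event))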