# Extracted from: glider-documentation/html/generator_8py_source.html

import os

import io

import json

import copy

import boto3

import pandas as pd

import ssl

from pymongo import MongoClient

from dotenv import load_dotenv

from pathlib import Path

import sys

import traceback


# Explicit .env location, currently disabled in favour of load_dotenv()'s default lookup:
# dotenv_path = Path("src/importer/.env")

# Load settings from a .env file before anything reads os.environ.
# NOTE(review): presumably this supplies MONGO_GLIDER, DB, CAT_COLLECTION,
# AWS_KEY_ID and AWS_SECRET_KEY when running locally — confirm.
load_dotenv()


# The string below describes the Generator class. It is a bare string
# expression (it follows the imports), so it is not the module docstring.
"""

This class has the functionality to grab a specific group of sales files

in OTTO format from S3 and generate a JSON catalogue file.

"""

# NOTE(review): not referenced by any code visible in this file — confirm it is still needed.
s3_file_obj = None


class EmptyException(Exception):
    """Signals that a catalogue generator was given no S3 path to work with."""


class Generator:
    """
    Build a release catalogue from OTTO-format sales files stored in S3
    and push it to a Mongo collection.

    The class keeps no state: every method receives what it needs as
    arguments, so a single instance can be reused freely.
    """

    # Per-track columns stored next to "isrc_id" in every catalogue record.
    _TRACK_FIELDS = ("release_title", "track_title", "label_id", "artists")

    def get_data(self, key, bucket, s3_client):
        """
        Download one parquet object from S3 and load it into pandas.

        args:
            key: key of the object inside the bucket.
            bucket: name of the S3 bucket.
            s3_client: boto3 S3 client; a default client is created when None.

        returns: a pandas dataframe with the content of the parquet file.
        """
        if s3_client is None:
            s3_client = boto3.client('s3')
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        # read_parquet needs a seekable file-like object, hence the BytesIO copy.
        return pd.read_parquet(io.BytesIO(obj['Body'].read()))

    @staticmethod
    def _pick_source_key(contents):
        """
        Choose which listed S3 object the catalogue is generated from.

        Keys ending in "/" (folder placeholders) are never candidates.
        When the listing holds more than two entries the most recently
        modified file wins; otherwise the last listed file is used.

        args:
            contents: the "Contents" list of an S3 listing; each entry is a
                dict with at least "Key" and "LastModified".

        returns: the chosen key, or None when the listing has no data file.
        """
        files = [entry for entry in contents if not entry["Key"].endswith("/")]
        if not files:
            return None
        if len(contents) > 2:
            # Compare the datetimes directly: strftime('%s') is a
            # platform-specific directive and is not portable.
            return sorted(files, key=lambda entry: entry["LastModified"])[-1]["Key"]
        return files[-1]["Key"]

    def getDf(self, ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path):
        """
        Load the sales file found under an S3 prefix.

        args:
            ACCESS_ID: AWS access key id.
            ACCESS_KEY: AWS secret access key.
            RESULTS_BUCKET: bucket in which results are saved.
            path: key prefix under which the sales files live.

        returns: a pandas dataframe whose missing values are replaced by
            the string "undefined".

        raises: FileNotFoundError when no data file exists under the prefix.
        """
        s3_client = boto3.client("s3", aws_access_key_id=ACCESS_ID, aws_secret_access_key=ACCESS_KEY)
        # NOTE(review): a single list_objects call is not paginated; confirm the
        # prefix never holds more keys than one response page returns.
        response = s3_client.list_objects(Bucket=RESULTS_BUCKET, Prefix=path)
        # "Contents" is absent from the response when nothing matches the prefix.
        source_key = self._pick_source_key(response.get("Contents", []))
        if source_key is None:
            raise FileNotFoundError(
                "No sales file found in bucket {} under prefix {}".format(RESULTS_BUCKET, path)
            )
        print("File {} is gonna generate the catalog".format(source_key))
        df = self.get_data(source_key, RESULTS_BUCKET, s3_client)
        df.fillna("undefined", inplace=True)
        return df

    def unique_values_from_isrc(self, df, release, isrc, field):
        """
        Look up one value of `field` for every ISRC of a release.

        args:
            df: sales dataframe with "release_id", "isrc_id" and `field` columns.
            release: release id the ISRCs belong to.
            isrc: list of ISRC ids to look up.
            field: name of the column to read.

        returns: a list aligned with `isrc`, holding the first distinct value
            of `field` found for each (release, isrc) pair.

        raises: IndexError when a pair has no row in `df`.
        """
        # The release filter does not depend on the ISRC, so apply it once.
        release_rows = df[df["release_id"] == release]
        values = []
        for isrc_id in isrc:
            matches = release_rows.loc[release_rows["isrc_id"] == isrc_id, field]
            values.append(matches.unique()[0])
        return values

    def _aggregate_catalog(self, df_pl, client):
        """
        Collapse a sales dataframe into one catalogue row per release.

        args:
            df_pl: sales dataframe with "release_id", "isrc_id" and the
                columns named in _TRACK_FIELDS.
            client: client id stored in every row.

        returns: a dataframe with columns release_id, isrc_id, release_title,
            track_title, label_id, artists and client_id. All columns except
            release_id and client_id hold lists aligned with isrc_id.
        """
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the output is deterministic (a set would give an arbitrary order).
        aggregated = (
            df_pl.groupby("release_id")["isrc_id"]
            .apply(lambda ids: list(dict.fromkeys(ids)))
            .reset_index(name="isrc_id")
        )
        for col in self._TRACK_FIELDS:
            per_release = [
                self.unique_values_from_isrc(df_pl, release, isrcs, col)
                for release, isrcs in zip(aggregated["release_id"], aggregated["isrc_id"])
            ]
            # Wrap in an object Series so each cell keeps its list as-is.
            aggregated[col] = pd.Series(per_release, index=aggregated.index, dtype=object)
        aggregated["client_id"] = client
        return aggregated

    def generate(self, ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path, client):
        """
        Read the sales file from S3 and build the catalogue dataframe.

        args:
            ACCESS_ID: AWS access key id.
            ACCESS_KEY: AWS secret access key.
            RESULTS_BUCKET: bucket in which results are saved.
            path: key prefix under which the sales files live.
            client: client id stored in every catalogue row.

        returns: a pandas dataframe grouped by release id, e.g. one row as JSON:

            {"release_id": "8024709032027",
             "isrc_id": ["US29B0747640", "US29B0747642"],
             "release_title": ["L'Hymne A L'Amour", "L'Hymne A L'Amour"],
             "track_title": ["Il Postino", "Para Jobim"],
             "label_id": ["CAM JAZZ", "CAM JAZZ"],
             "artists": ["Richard Galliano featuring Gary Burton",
                         "Richard Galliano featuring Gary Burton"],
             "client_id": "<client>"}
        """
        df_pl = self.getDf(ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path)
        return self._aggregate_catalog(df_pl, client)

    def update_data(self, line, collection):
        """
        Insert a catalogue record, or merge it into the stored release.

        When the release already exists, only the tracks whose ISRC is not
        stored yet are appended (each new ISRC once) and client_id is
        overwritten; otherwise the record is inserted as-is.

        args:
            line: catalogue record (dict) as produced by `generate`.
            collection: Mongo collection where the catalogue is saved.

        returns: Nothing.
        """
        release_filter = {"release_id": line["release_id"]}
        release_found = collection.find_one(release_filter)
        if not release_found:
            print("UPLOADING {} DATA TO MONGO".format(line["release_id"]))
            collection.insert_one(line)
            return

        print("UPDATING {} DATA TO MONGO".format(line["release_id"]))
        # Positions (inside `line`) of the tracks the stored release lacks.
        known_isrcs = set(release_found["isrc_id"])
        new_positions = []
        for position, isrc_id in enumerate(line["isrc_id"]):
            if isrc_id not in known_isrcs:
                known_isrcs.add(isrc_id)
                new_positions.append(position)

        # The per-track lists are index-aligned, so extend all of them with
        # the values found at the same positions.
        merged = {
            field: list(release_found[field]) + [line[field][position] for position in new_positions]
            for field in ("isrc_id",) + self._TRACK_FIELDS
        }
        merged["client_id"] = line["client_id"]
        collection.update_one(release_filter, {"$set": merged})

    def upload_catalog(self, df):
        """
        Write every row of a catalogue dataframe to Mongo.

        Connection settings come from the MONGO_GLIDER (uri), DB (database
        name) and CAT_COLLECTION (collection name) environment variables.

        args:
            df: catalogue dataframe as returned by `generate`.

        returns: Nothing.
        """
        uri = os.environ.get("MONGO_GLIDER")
        db_name = os.environ.get("DB")
        collection_name = os.environ.get("CAT_COLLECTION")
        # NOTE(review): ssl.CERT_NONE disables TLS certificate verification.
        # PyMongo 4 removed `ssl_cert_reqs` in favour of
        # `tlsAllowInvalidCertificates` — confirm the pinned PyMongo version.
        client = MongoClient(uri, ssl_cert_reqs=ssl.CERT_NONE)
        try:
            collection = client[db_name][collection_name]
            # Round-trip through JSON so the records hold plain Python values.
            for record in json.loads(df.to_json(orient="records")):
                self.update_data(record, collection)
        finally:
            # Release the connection pool even when an update fails.
            client.close()


def lambda_handler(event, context):
    """
    AWS Lambda entry point: rebuild the catalogue and upload it to Mongo.

    args:
        event: dict whose "status" must be 'Snapshots Created' for any work
            to happen; it must then also carry "results_bucket",
            "results_path" and "client_id".
        context: lambda default arg (unused).

    returns: a dict with a single "status" key: "Catalog updated" on success,
        otherwise a message explaining why the catalogue was not updated.
    """
    if event.get("status") != 'Snapshots Created':
        return {"status": "Catalog could not be updated"}

    # Report missing input the same way every other failure is reported,
    # instead of letting a KeyError escape the handler.
    required = ("results_bucket", "results_path", "client_id")
    missing = [name for name in required if name not in event]
    if missing:
        return {"status": "Missing event field(s): {}".format(", ".join(missing))}

    access_id = os.environ.get("AWS_KEY_ID")
    access_key = os.environ.get("AWS_SECRET_KEY")
    results_bucket = event["results_bucket"]
    results_path = event["results_path"]
    client_id = event["client_id"]

    try:
        catalog_generator = Generator()
        aggregated = catalog_generator.generate(
            access_id, access_key, results_bucket, results_path, client_id
        )
        print(aggregated)
        print(aggregated.to_json(orient="records"))
        catalog_generator.upload_catalog(aggregated)
    except Exception as e:
        # Top-level boundary: log the full traceback and turn the failure
        # into a status message for the caller.
        print(traceback.format_exc())
        return {"status": str(e)}
    else:
        return {"status": "Catalog updated"}


# Doxygen cross-reference index (line numbers refer to the original generator.py).
#
# src.importer.generator.EmptyException
# Definition generator.py:22
#
# src.importer.generator.Generator
# Definition generator.py:25
#
# src.importer.generator.Generator.get_data
# get_data(self, key, bucket, s3_client)
# Definition generator.py:38
#
# src.importer.generator.Generator.generate
# generate(self, ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path, client)
# Definition generator.py:90
#
# src.importer.generator.Generator.upload_catalog
# upload_catalog(self, df)
# Definition generator.py:145
#
# src.importer.generator.Generator.unique_values_from_isrc
# unique_values_from_isrc(self, df, release, isrc, field)
# Definition generator.py:77
#
# src.importer.generator.Generator.getDf
# getDf(self, ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path)
# Definition generator.py:51
#
# src.importer.generator.Generator.update_data
# update_data(self, line, collection)
# Definition generator.py:118
#
# src.importer.generator.lambda_handler
# lambda_handler(event, context)
# Definition generator.py:163