Glider
Loading...
Searching...
No Matches
CatalogueGenerator.py
Go to the documentation of this file.
1import os
2import io
3import json
4import copy
5import boto3
6import pandas as pd
7import ssl
8from pymongo import MongoClient
9from dotenv import load_dotenv
10from pathlib import Path
11import sys
12import traceback
13
# dotenv_path = Path("src/importer/.env")
# Load environment variables (Mongo URI, DB names, AWS keys) from a .env file
# on the default search path; the explicit importer path above is disabled.
load_dotenv()

# NOTE(review): this bare module-level string describes the Generator class
# defined below; it is an expression statement, not an attached docstring.
"""
This class has the functionality to grab a specific group of sales files
in OTTO format from S3 and generate a JSON catalogue file.
"""
# Module-level placeholder for an S3 file object; never assigned or read in
# the visible code — presumably legacy, TODO confirm before removing.
s3_file_obj = None
class EmptyException(Exception):
    """Error raised when no list of S3 paths is supplied (see the
    commented-out ``Generator.__init__`` below)."""
24
class Generator:
    """Builds a JSON catalogue from OTTO-format sales files stored in S3.

    Sales rows are grouped by ``release_id``; for every release the lists of
    ISRCs and their release/track/label/artist values are collected, and each
    release document is then upserted into a MongoDB collection.
    """

    # def __init__(self, s3_path):
    #     self.s3_path = s3_path
    #     if not isinstance(s3_path, list):
    #         raise EmptyException('No s3 path passed/list required')

    def get_data(self, key, bucket, s3_client):
        """Download one parquet object from S3 and load it as a DataFrame.

        args:
            key: S3 object key of the parquet file.
            bucket: name of the bucket holding the file.
            s3_client: boto3 S3 client; a default client is created when None.
        returns: a pandas DataFrame with the file contents.
        """
        if s3_client is None:
            s3_client = boto3.client('s3')
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        return pd.read_parquet(io.BytesIO(obj['Body'].read()))

    def getDf(self, ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path):
        """Load the sales DataFrame found under *path* in the results bucket.

        When the prefix holds more than two objects, only the most recently
        modified (non-folder) file is loaded; otherwise each non-folder object
        is read in turn and the last one read wins.  NaNs are replaced with
        the string "undefined".

        args:
            ACCESS_ID: AWS access key id.
            ACCESS_KEY: AWS secret access key.
            RESULTS_BUCKET: bucket in which results are saved.
            path: S3 prefix of the files to generate the catalogue from.
        returns: a pandas DataFrame.
        """
        s3_client = boto3.client("s3", aws_access_key_id=ACCESS_ID, aws_secret_access_key=ACCESS_KEY)
        response = s3_client.list_objects(Bucket=RESULTS_BUCKET, Prefix=path)
        if len(response["Contents"]) > 2:
            get_last_modified = lambda x: int(x['LastModified'].strftime('%s'))
            # BUG FIX: the original filter was `"/" not in obj`, which tests
            # the *dict keys* of the listing entry and is therefore always
            # true.  Folder placeholder keys (ending in "/") must be skipped
            # explicitly, matching the else-branch below.
            last_added = [obj['Key']
                          for obj in sorted(response["Contents"], key=get_last_modified)
                          if not obj['Key'].endswith("/")][-1]
            print("The last file added is {}".format(last_added))
            df = self.get_data(last_added, RESULTS_BUCKET, s3_client)
        else:
            for file in response["Contents"]:
                if file["Key"].endswith("/"):
                    continue
                print("File {} is gonna generate the catalog".format(file["Key"]))
                df = self.get_data(file["Key"], RESULTS_BUCKET, s3_client)
        df.fillna("undefined", inplace=True)
        return df

    def unique_values_from_isrc(self, df, release, isrc, field):
        """Collect, per ISRC of a release, the first unique value of *field*.

        args:
            df: sales DataFrame with release_id / isrc_id columns.
            release: release_id value to filter on.
            isrc: iterable of isrc_id values belonging to that release.
            field: column whose value is collected for each ISRC.
        returns: list with one value of *field* per entry in *isrc*.
        """
        unique = []
        for i in isrc:
            value = df[(df.release_id == release) & (df.isrc_id == i)][field]
            unique.append(value.unique()[0])
        return unique

    def generate(self, ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path, client):
        """Build the aggregated catalogue DataFrame for one client.

        args:
            ACCESS_ID / ACCESS_KEY: AWS credentials.
            RESULTS_BUCKET: bucket in which results are saved.
            path: S3 prefix of the sales files.
            client: client id stamped onto every catalogue row.
        returns: DataFrame grouped by release_id with list-valued columns
            isrc_id / release_title / track_title / label_id / artists and a
            scalar client_id column, e.g.:

            {"release_id":"8024709032027",
             "isrc_id":["US29B0747640", ...],
             "release_title":["L'Hymne A L'Amour", ...],
             "track_title":["Il Postino", ...],
             "label_id":["CAM JAZZ", ...],
             "artists":["Richard Galliano featuring Gary Burton", ...]}
        """
        df_pl = self.getDf(ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path)
        columns_to_convert = ["release_title", "track_title", "label_id", "artists"]
        aggregated = df_pl.groupby('release_id')["isrc_id"].apply(list).reset_index(name="isrc_id")
        # Deduplicate the ISRC list of each release before expanding columns.
        aggregated["isrc_id"] = aggregated.apply(lambda Row: list(set(Row["isrc_id"])), axis=1)
        for col in columns_to_convert:
            aggregated[col] = aggregated.apply(
                lambda Row: self.unique_values_from_isrc(df_pl, Row["release_id"], Row["isrc_id"], col),
                axis=1)
        aggregated["client_id"] = client
        return aggregated

    def update_data(self, line, collection):
        """Upsert one catalogue record into the Mongo collection.

        Existing releases get the new ISRCs (and their parallel
        release_title / track_title / label_id / artists entries) appended;
        unknown releases are inserted verbatim.

        args:
            line: dict for one release (one row of the generate() output).
            collection: pymongo collection holding the catalogue.
        returns: nothing; mutates the collection.
        """
        release_found = collection.find_one({"release_id": line["release_id"]})
        if release_found:
            print("UPDATING {} DATA TO MONGO".format(line["release_id"]))
            # Dead code removed: the original deep-copied release_found into
            # release_copy, extended it, and never used it again.
            items_to_add = [line["isrc_id"].index(element)
                            for element in line["isrc_id"]
                            if element not in release_found["isrc_id"]]
            release_found["isrc_id"].extend([line["isrc_id"][element] for element in items_to_add])
            release_found["release_title"].extend([line["release_title"][element] for element in items_to_add])
            release_found["track_title"].extend([line["track_title"][element] for element in items_to_add])
            release_found["label_id"].extend([line["label_id"][element] for element in items_to_add])
            release_found["artists"].extend([line["artists"][element] for element in items_to_add])
            collection.update_one({"release_id": line["release_id"]},
                                  {"$set": {"isrc_id": release_found["isrc_id"],
                                            "release_title": release_found["release_title"],
                                            "track_title": release_found["track_title"],
                                            "label_id": release_found["label_id"],
                                            "artists": release_found["artists"],
                                            "client_id": line["client_id"]}})
        else:
            print("UPLOADING {} DATA TO MONGO".format(line["release_id"]))
            collection.insert_one(line)

    def upload_catalog(self, df):
        """Write the aggregated catalogue into MongoDB, one upsert per release.

        Connection parameters come from the MONGO_GLIDER / DB / CAT_COLLECTION
        environment variables.

        args:
            df: aggregated catalogue DataFrame (output of generate()).
        returns: nothing; mutates the Mongo collection.
        """
        uri = os.environ.get("MONGO_GLIDER")
        db_name = os.environ.get("DB")
        collection_name = os.environ.get("CAT_COLLECTION")
        try:
            # pymongo < 4 accepts ssl_cert_reqs; newer versions reject the
            # keyword, in which case fall back to a plain connection.
            client = MongoClient(uri, ssl_cert_reqs=ssl.CERT_NONE)
        except Exception:
            client = MongoClient(uri)
        db = client[db_name]
        collection = db[collection_name]
        df_json = json.loads(df.to_json(orient="records"))
        # Plain loop: the original used a list comprehension purely for its
        # side effects.
        for record in df_json:
            self.update_data(record, collection)
167
def generate_catalogue(event, context):
    """AWS Lambda entry point: regenerate and upload the sales catalogue.

    args:
        event (dict): client and sales information; expected keys are
            "status", "results_bucket", "results_path" and "client_id".
        context: Lambda context object (unused).
    returns: dict with a "status" message describing the outcome.
    """
    if event["status"] != 'Snapshots Created':
        return {"status": "Catalog could not be updated"}
    ID = os.environ.get("AWS_KEY_ID")
    KEY = os.environ.get("AWS_SECRET_KEY")
    # RESULTS_BUCKET = os.environ.get("RESULTS_BUCKET")
    try:
        # BUG FIX: event keys are read inside the try so a malformed payload
        # yields a status response instead of an unhandled KeyError.
        RESULTS_BUCKET = event["results_bucket"]
        RESULTS_PATH = event["results_path"]
        CLIENT_ID = event["client_id"]
        catalog_generator = Generator()
        aggregated = catalog_generator.generate(ID, KEY, RESULTS_BUCKET, RESULTS_PATH, CLIENT_ID)
        print(aggregated)
        print(aggregated.to_json(orient="records"))
        catalog_generator.upload_catalog(aggregated)
    except Exception as e:
        print(sys.exc_info()[2])
        print(traceback.format_exc())
        return {"status": str(e)}
    else:
        return {"status": "Catalog updated"}
generate(self, ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path, client)
unique_values_from_isrc(self, df, release, isrc, field)
getDf(self, ACCESS_ID, ACCESS_KEY, RESULTS_BUCKET, path)