glider-documentation/html/_audio_mack_template_8py_source.html

import re

import awswrangler as wr


class AudioMackTemplate:
    """Importer template for Audiomack report files.

    Loads a report CSV from S3 and derives the report date (always the first
    day of the month) from the file name.
    """

    # Lowercase three-letter month abbreviation -> two-digit month number.
    _MONTHS = {
        "jan": "01", "feb": "02", "mar": "03", "apr": "04",
        "may": "05", "jun": "06", "jul": "07", "aug": "08",
        "sep": "09", "oct": "10", "nov": "11", "dec": "12",
    }

    # Numeric form: an underscore followed by "20" and four more digits,
    # e.g. "_202207" (year + month).
    _NUMERIC_DATE_RE = re.compile(r"_(20\d{4})")

    # Month-name form: an underscore, a month name (only the first three
    # letters are significant, so "sep" and "sept" both work), an optional
    # single separator, and a two- or four-digit year,
    # e.g. "_sept22", "_sep_2022".
    _MONTH_NAME_RE = re.compile(
        r"_(" + "|".join(_MONTHS) + r")[a-z]*[ _-]?(?:20)?(\d{2})"
    )

    def preprocessing(self, filename, features, session, rel_col):
        """Load the report file from S3 into a dataframe.

        The ``rel_col`` column is read as ``str`` so identifiers keep their
        exact textual form. If that read fails for any reason, the file is
        read again without the dtype override.

        Args:
            filename (str): s3 full path of filename
            features (dict): must contain "delimiter" and "skip_rows" for the
                current file (other keys are ignored here)
            session (boto3 obj): AWS session passed to awswrangler
            rel_col (str): column name used to identify upc/release_id

        Returns: df (pandas dataframe)
        """
        read_kwargs = {
            "sep": features["delimiter"],
            "skiprows": features["skip_rows"],
            "low_memory": False,
            "boto3_session": session,
        }
        try:
            return wr.s3.read_csv(filename, dtype={rel_col: str}, **read_kwargs)
        except Exception:
            # Best-effort fallback: retry without forcing the dtype.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are not swallowed by the retry.
            return wr.s3.read_csv(filename, **read_kwargs)

    def date_by_month(self, filename):
        """Build the report date from a file name with an explicit month name.

        For example ``Audiomack_Sept22.csv`` -> ``"2022-09-01"``. Matching is
        case-insensitive, and the year may be written with two or four digits
        (``_Sept22``, ``_Sept_2022``). Two-digit years are taken to be in the
        2000s.

        Args:
            filename (str): current filename

        Returns: date_str (str) formatted as "YYYY-MM-01"

        Raises:
            ValueError: if no "_<month><year>" token is found in ``filename``.
        """
        match = self._MONTH_NAME_RE.search(filename.lower())
        if match is None:
            raise ValueError(
                "Cannot extract month/year from filename: {!r}".format(filename)
            )
        month_abbr, year = match.groups()
        return "20" + year + "-" + self._MONTHS[month_abbr] + "-01"

    def date(self, filename):
        """Build the report date from the file name.

        Tries the numeric form first (``Audiomack_202207.csv`` ->
        ``"2022-07-01"``) and falls back to the month-name form handled by
        :meth:`date_by_month` (``Audiomack_Sept22.csv``).

        Args:
            filename (str): current filename

        Returns: date_str (str) formatted as "YYYY-MM-01"

        Raises:
            ValueError: if neither form is found in ``filename``.
        """
        filename = filename.lower()
        match = self._NUMERIC_DATE_RE.search(filename)
        if match is None:
            return self.date_by_month(filename)
        digits = match.group(1)
        return digits[:4] + "-" + digits[4:] + "-01"


src.importer.templates.AudioMackTemplate.AudioMackTemplate
Definition AudioMackTemplate.py:4

src.importer.templates.AudioMackTemplate.AudioMackTemplate.date
date(self, filename)
Definition AudioMackTemplate.py:44

src.importer.templates.AudioMackTemplate.AudioMackTemplate.preprocessing
preprocessing(self, filename, features, session, rel_col)
Definition AudioMackTemplate.py:5

src.importer.templates.AudioMackTemplate.AudioMackTemplate.date_by_month
date_by_month(self, filename)
Definition AudioMackTemplate.py:24