29 """Loads the file and applies some fixes so that it fits our standard and can be processed
32 filename (str): s3 full path of filename
33 features (dict): contains delimiter, skip_rows and encoding required for current file
34 session (boto3 obj): AWS client connection
35 rel_col (str): column name used to identify upc/release_id
36 Returns: df (pandas dataframe)
38 delimiter = features[
"delimiter"]
39 skip_rows = features[
"skip_rows"]
40 encoding = features[
"encoding"]
41 df = wr.s3.read_csv(filename, sep=delimiter, skiprows=skip_rows, encoding=encoding, dtype={rel_col:str}, boto3_session=session)
43 values = {
"Asset Quantity": 0,
"Product Quantity": 0,
"Asset Artist":
"",
"Asset ISRC":
"undefined"}
44 df.fillna(value=values, inplace=
True)
45 df[
"Asset Quantity"] = df.apply(
lambda Row: self.
QuantityPreprocessing(Row[
'Product Quantity'], Row[
'Asset Quantity']), axis=1)
46 df[
"Asset Artist"] = df.apply(
lambda Row: self.
ArtistPreprocessing(Row[
'Product Artist'], Row[
'Asset Artist']), axis=1)
73 """Sets the date column based on the filename (which contains the date).
74 For example: December2021StatementRun_KepachMusictasCAMJazz-royalty_product_and_asset.csv
76 filename (str): current filename
77 Returns: date_str (str)
79 filename = filename.lower()
80 possible_months = [
"january",
"february",
"march",
"april",
"may",
"june",
81 "july",
"august",
"september",
"october",
"november",
"december"]
82 for m
in possible_months:
85 date = filename.split(month)
88 year = [str(s)
for s
in re.findall(
r'-?\d+\.?\d*', date)][0]
89 date_month = datetime.strptime(month,
"%B")
90 date_month = str(date_month.month)
91 if len(date_month) == 1:
92 date_month =
"0"+date_month
93 date_str = year+date_month