Glider
Loading...
Searching...
No Matches
SoundtrackTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3
def preprocessing(self, filename, features, session, rel_col):
    """Read the report CSV from S3 and tidy it up for further processing.

    The file is parsed with the options supplied in ``features``; ``rel_col``
    is read as ``str`` and empty cells are kept as-is rather than turned into
    NaN. Afterwards every value of the "Country_Of_Sale" column is converted
    to ``str`` and has its space characters removed.

    Args:
        filename (str): s3 full path of the file to load
        features (dict): provides the "delimiter", "skip_rows" and "encoding" used to parse the file
        session (boto3 obj): AWS connection handed to awswrangler
        rel_col (str): name of the column used to identify upc/release_id
    Returns: frame (pandas dataframe)
    """
    def strip_spaces(value):
        # Normalise through str() first, since cells may not be strings.
        return str(value).replace(" ", "")

    # Keys are looked up in the same order as the parser needs them:
    # delimiter, then skip_rows, then encoding.
    read_options = {
        "sep": features["delimiter"],
        "skiprows": features["skip_rows"],
        "encoding": features["encoding"],
        "dtype": {rel_col: str},
        "keep_default_na": False,
        "low_memory": False,
        "boto3_session": session,
    }
    frame = wr.s3.read_csv(filename, **read_options)

    frame["Country_Of_Sale"] = frame["Country_Of_Sale"].apply(strip_spaces)
    return frame
22
def date(self, filename):
    """Build the report date from the six-digit token embedded in the filename.

    The first run of exactly six digits enclosed by underscores is read as
    YYYYMM and returned as the first day of that month. For example
    gyrostream-pty-ltd_soundtrack-your-brand_202203_Monthly-Sales.csv
    gives "2022-03-01".

    Args:
        filename (str): current filename
    Returns: date_str (str)
    Raises: IndexError when the filename holds no ``_DDDDDD_`` token
    """
    # The capture group keeps only the digits, so the surrounding underscores
    # never need stripping; [0] selects the first token found.
    year_month = re.findall(r'_(\d{6})_', filename)[0]
    year, month = year_month[:4], year_month[4:]
    return f"{year}-{month}-01"
preprocessing(self, filename, features, session, rel_col)