Glider
Loading...
Searching...
No Matches
AmazonTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3
def preprocessing(self, filename, features, session, rel_col):
    """Read a delimited report from S3 and discard its final row.

    Args:
        filename (str): s3 full path of the file to load
        features (dict): parsing options; the keys "delimiter", "skip_rows"
            and "encoding" are read from it
        session (boto3 obj): AWS session handed to awswrangler as boto3_session
        rel_col (str): column name used to identify upc/release_id; it is
            read with str dtype
    Returns: df (pandas dataframe)
    """
    read_options = {
        "sep": features["delimiter"],
        "skiprows": features["skip_rows"],
        "encoding": features["encoding"],
        "dtype": {rel_col: str},
        "keep_default_na": False,
        "low_memory": False,
        "boto3_session": session,
    }
    df = wr.s3.read_csv(filename, **read_options)

    # The last row is always removed -- presumably a trailer/summary line in
    # the report; TODO confirm against a sample file.
    last_label = df.index[-1]
    df.drop(last_label, inplace=True)
    return df
23
def assetCurrency(self, df, filename):
    """Set the currency column from the region code embedded in the filename.

    The two characters right before the first "." of the file's base name
    are taken as the region code and mapped to a currency.
    For example ZQGRO_Monthly_ADS_Usage_202112_EU.txt gives "EUR".

    Args:
        df (pandas dataframe): dataframe where changes will be applied
        filename (str): current filename; a full path (e.g. an s3 path) is
            accepted, only its last "/"-separated component is inspected
    Returns: df (pandas dataframe), the same object with the
        "currency_from_filename" column set
    Raises:
        KeyError: if the region code is not present in the currency mapping
    """
    currencies = {
        "AT": "EUR",
        "ES": "EUR",
        "FR": "EUR",
        "GB": "GBP",
        "IT": "EUR",
        "JP": "JPY",
        "EU": "EUR",
        "FE": "USD",
        "NA": "USD",
        "US": "USD",
        "IN": "USD",
        "DE": "EUR",
    }

    # Look at the base name only: splitting the whole path on "." would stop
    # at the first dot of a bucket or directory name (e.g. "s3://my.bucket/...")
    # and pick up the wrong two characters.
    base_name = filename.rsplit("/", 1)[-1]
    region = base_name.split(".", 1)[0][-2:]

    try:
        currency = currencies[region]
    except KeyError:
        # Same exception type as a plain lookup, but say what went wrong.
        raise KeyError(
            "unknown region code {!r} in filename {!r}".format(region, filename)
        ) from None

    df["currency_from_filename"] = currency
    return df
37
38
def date(self, filename):
    """Build a first-of-month date string from the period in the filename.

    The first "_20dddd_" token (d being a digit) found in the filename is
    read as year followed by month.
    For example ZQGRO_Monthly_ADS_Usage_202112_EU.txt gives "2021-12-01".

    Args:
        filename (str): current filename
    Returns: date_str (str)
    Raises:
        IndexError: if the filename contains no such token
    """
    period_token = re.findall(r'_20\d{4}_', filename)[0]
    # Strip the surrounding underscores, leaving the six digits YYYYMM.
    digits = period_token[1:-1]
    year, month = digits[:4], digits[4:]
    return "-".join((year, month, "01"))
preprocessing(self, filename, features, session, rel_col)