Glider
Loading...
Searching...
No Matches
TencentTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3
5
def preprocessing(self, filename, features, session, rel_col):
    """Load the file and clean its upc/release_id column so it fits our standard.

    Args:
        filename (str): s3 full path of filename
        features (dict): contains the "delimiter", "skip_rows" and "encoding"
            required for current file
        session (boto3 obj): AWS client connection
        rel_col (str): column name used to identify upc/release_id; it is
            read as text and every value is passed through fix_upc
    Returns: df (pandas dataframe)
    """
    df = wr.s3.read_csv(
        filename,
        sep=features["delimiter"],
        skiprows=features["skip_rows"],
        encoding=features["encoding"],
        dtype={rel_col: str},
        low_memory=False,
        boto3_session=session,
    )
    # Transform only the column that changes. A row-wise apply would build a
    # Series per row and may not return a Series for a frame without rows.
    df[rel_col] = df[rel_col].map(self.fix_upc)
    return df
22
def fix_upc(self, upc):
    """Delete every 'UPC-' marker from a upc shown in the original file.

    Args:
        upc (str): upc shown in the original file
    Returns: upc (str) without 'UPC-'; a value that is not a string (for
        example the NaN pandas uses for an empty cell) is returned unchanged
    """
    if not isinstance(upc, str):
        # Empty cells are not strings and have no .replace(); hand them back
        # untouched instead of aborting the whole file.
        return upc
    return upc.replace("UPC-", "")
31
def fix_date(self, start_date):
    """Return the part of the sale date that comes before its first '-'.

    Args:
        start_date: sale date shown in the original file; it is converted
            with str() before being cut
    Returns: start_date_fix (str), the whole text when it contains no '-'
    """
    # NOTE(review): presumably the raw value is a "start-end" period; a value
    # already shaped like YYYY-MM-DD would be reduced to its year - confirm.
    head, _, _ = str(start_date).partition("-")
    return head
42
def date(self, df):
    """Add an 'iso_date' column built from 'start_date' and return it.

    Args:
        df (pandas dataframe): dataframe where the change will be applied; it
            needs a 'start_date' column and is modified in place
    Returns: df[iso_date] (pandas series)
    """
    # Map the one column instead of applying row by row: each value reaches
    # fix_date with its own dtype (a row-wise apply upcasts mixed numeric rows
    # before str() sees them) and no per-row Series has to be built.
    df["iso_date"] = df["start_date"].map(self.fix_date)
    return df["iso_date"]
preprocessing(self, filename, features, session, rel_col)