Glider
Loading...
Searching...
No Matches
itunesTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3
def preprocessing(self, filename, features, session, rel_col):
    """Load a report from S3 and drop every row from the footer onwards.

    The file must contain exactly one footer row whose "Start Date" cell is
    the literal "Total_Rows"; the "End Date" cell of that row holds the
    number of data rows to keep.

    Args:
        filename (str): s3 full path of filename
        features (dict): contains delimiter, skip_rows and encoding required for current file
        session (boto3 obj): AWS client connection
        rel_col (str): column name used to identify upc/release_id (read with dtype str)
    Returns: df (pandas dataframe)
    Raises:
        TypeError: if the file does not contain exactly one "Total_Rows" row.
    """
    df = wr.s3.read_csv(
        filename,
        sep=features["delimiter"],
        skiprows=features["skip_rows"],
        encoding=features["encoding"],
        dtype={rel_col: str},
        low_memory=False,
        boto3_session=session,
    )

    footer = df["End Date"][df["Start Date"] == "Total_Rows"]
    if len(footer) != 1:
        # int(<Series>) used to raise TypeError for anything but a single
        # match; keep the exception type but say what is actually wrong.
        raise TypeError(
            f"expected exactly one 'Total_Rows' footer row in {filename}, "
            f"found {len(footer)}"
        )
    # Take the scalar explicitly: calling int() on a one-element Series is
    # deprecated in pandas.
    total_rows = int(footer.iloc[0])

    # Drops by row label, so this relies on the labels being 0..n-1
    # (presumably the default index returned by read_csv -- confirm if an
    # index column is ever configured).
    df.drop(range(total_rows, df.shape[0]), axis=0, inplace=True)
    return df
24
def preprocessing_music(self, filename, features, session, rel_col):
    """Load a report from S3 and drop every row from the footer onwards.

    It's used for a specific itunes version: the file must contain exactly
    one footer row whose "Storefront Name" cell is the literal "Row Count";
    the "Apple Identifier" cell of that row holds the number of data rows
    to keep.

    Args:
        filename (str): s3 full path of filename
        features (dict): contains delimiter, skip_rows and encoding required for current file
        session (boto3 obj): AWS client connection
        rel_col (str): column name used to identify upc/release_id (read with dtype str)
    Returns: df (pandas dataframe)
    Raises:
        TypeError: if the file does not contain exactly one "Row Count" row.
    """
    df = wr.s3.read_csv(
        filename,
        sep=features["delimiter"],
        skiprows=features["skip_rows"],
        encoding=features["encoding"],
        dtype={rel_col: str},
        low_memory=False,
        boto3_session=session,
    )

    footer = df["Apple Identifier"][df["Storefront Name"] == "Row Count"]
    if len(footer) != 1:
        # int(<Series>) used to raise TypeError for anything but a single
        # match; keep the exception type but say what is actually wrong.
        raise TypeError(
            f"expected exactly one 'Row Count' footer row in {filename}, "
            f"found {len(footer)}"
        )
    # Take the scalar explicitly: calling int() on a one-element Series is
    # deprecated in pandas.
    total_rows = int(footer.iloc[0])

    # Drops by row label, so this relies on the labels being 0..n-1
    # (presumably the default index returned by read_csv -- confirm if an
    # index column is ever configured).
    df.drop(range(total_rows, df.shape[0]), axis=0, inplace=True)
    return df
45
def type(self, upc):
    """Return a one-letter code depending on whether a upc value is present.

    Args:
        upc: value to inspect; it is compared after conversion with str()
    Returns: "T" when str(upc) is "" or "nan", otherwise "A"
        (presumably Track / Album -- confirm with the client mapping)
    """
    text = str(upc)
    return "T" if text in ("", "nan") else "A"
55
def assetType(self, df):
    """Add a "gettypeitunes" column computed from each row's "upc" value.

    Args:
        df (pandas dataframe): dataframe to modify; it must have a "upc" column
    Returns: df (pandas dataframe), the same object with the new column set
    """
    def classify(row):
        # Delegate the per-row decision to self.type.
        return self.type(row["upc"])

    labels = df.apply(classify, axis=1)
    df["gettypeitunes"] = labels
    return df
64
def date(self, filename):
    """Build a YYYYMM01 date string from the MMYY token in the filename.

    The first "_DDDD_" group (four digits between underscores) is read as
    month then two-digit year, e.g. S1_89680172_0723_ZZ.txt -> "20230701".
    The century is always "20" and the day is always "01".

    Args:
        filename (str): current filename
    Returns: date_str (str)
    Raises:
        IndexError: if the filename contains no "_DDDD_" group.
    """
    token = re.findall(r'_\d{4}_', filename)[0]
    mmyy = token.strip("_")
    month, short_year = mmyy[:2], mmyy[2:]
    return f"20{short_year}{month}01"
preprocessing(self, filename, features, session, rel_col)
preprocessing_music(self, filename, features, session, rel_col)