Glider
Loading...
Searching...
No Matches
AudioMackTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3
def preprocessing(self, filename, features, session, rel_col):
    """Load a delimited file from S3 into a DataFrame.

    The file is first read with ``rel_col`` forced to ``str``. If that read
    fails for any reason, it is retried once without the dtype override.

    Args:
        filename (str): s3 full path of filename
        features (dict): must contain "delimiter" and "skip_rows"; no other
            key (e.g. an encoding) is read by this method
        session (boto3 obj): AWS client connection
        rel_col (str): column name used to identify upc/release_id
    Returns: df (pandas dataframe)
    Raises:
        KeyError: if "delimiter" or "skip_rows" is missing from ``features``.
        Exception: whatever the second read raises is propagated unchanged.
    """
    read_options = {
        "sep": features["delimiter"],
        "skiprows": features["skip_rows"],
        "low_memory": False,
        "boto3_session": session,
    }
    try:
        # Force the release identifier column to text.
        # NOTE(review): presumably so upc/release ids are not parsed as
        # numbers -- confirm with the data owners.
        return wr.s3.read_csv(filename, dtype={rel_col: str}, **read_options)
    except Exception:
        # Best-effort retry without the dtype override. Only Exception is
        # caught so KeyboardInterrupt/SystemExit are no longer swallowed.
        return wr.s3.read_csv(filename, **read_options)
23
def date_by_month(self, filename):
    """Build a first-of-month date string from a filename with a month name.

    Used when the month is written out in the filename, for example
    ``Audiomack_Sept22.csv`` -> ``"2022-09-01"``. Matching is
    case-insensitive.

    The month is the first ``_<letters>`` token whose first three letters are
    an English month abbreviation. The year is the two digits (``2x``,
    optionally written as ``202x``) that directly follow that token; if none
    follow it, the first ``2x`` pair anywhere in the filename is used.
    Only years 2020-2029 can be recognised.

    Args:
        filename (str): current filename
    Returns: date_str (str) formatted as "YYYY-MM-01"
    Raises:
        KeyError: if no ``_<letters>`` token names a month.
        IndexError: if no two-digit year can be found.
    """
    month = {"jan": "01", "feb": "02", "mar": "03", "apr": "04", "may": "05", "jun": "06",
             "jul": "07", "aug": "08", "sep": "09", "oct": "10", "nov": "11", "dec": "12"}
    name = filename.lower()

    for token in re.finditer(r'_([a-z]+)', name):
        dig_month = month.get(token.group(1)[:3])
        if dig_month is None:
            # Not a month (e.g. "_bucket" in a path); try the next token.
            continue

        # Prefer the year written right after the month ("sept22",
        # "sept_2022") so unrelated digits earlier in the name are ignored.
        adjacent = re.match(r'[^a-z0-9]*(?:20)?(2\d)(?!\d)', name[token.end():])
        if adjacent is not None:
            year = adjacent.group(1)
        else:
            # Fall back to the first "2x" pair anywhere in the name.
            anywhere = re.search(r'2\d', name)
            if anywhere is None:
                raise IndexError("no two-digit year found in filename: %r" % (filename,))
            year = anywhere.group(0)

        return "20" + year + "-" + dig_month + "-01"

    raise KeyError("no month name found in filename: %r" % (filename,))
42
43
def date(self, filename):
    """Build a first-of-month date string from the date in the filename.

    The filename is lowercased, then searched for ``_20`` followed by four
    digits (``_YYYYMM``), e.g. ``Audiomack_202207.csv`` -> ``"2022-07-01"``.
    The two month digits are not validated. When no such stamp is present,
    the lowercased filename is handed to ``self.date_by_month``.

    Args:
        filename (str): current filename
    Returns: date_str (str) formatted as "YYYY-MM-01"
    Raises:
        Exception: whatever ``self.date_by_month`` raises on the fallback
            path is propagated unchanged.
    """
    filename = filename.lower()
    stamp = re.search(r'_(20\d{2})(\d{2})', filename)
    if stamp is None:
        # No numeric stamp: the month is presumably spelled out instead.
        return self.date_by_month(filename)
    return stamp.group(1) + "-" + stamp.group(2) + "-01"
preprocessing(self, filename, features, session, rel_col)