Glider
Loading...
Searching...
No Matches
glider
src
importer
templates
YoutubeTemplate.py
Go to the documentation of this file.
1
import
re
2
import
awswrangler
as
wr
3
from
pandas
import
to_datetime
4
5
class YoutubeTemplate:
    """Importer template for YouTube asset report CSV files stored on S3.

    Provides the hooks used to load a report (``preprocessing``), map the
    report's asset type to a sale-type label (``type``) and derive the
    reporting period from the file name (``date``).
    """

    def preprocessing(self, filename, features, session, rel_col):
        """Load the file and adjust it so it can be processed downstream.

        Args:
            filename (str): full S3 path of the file.
            features (dict): must contain the ``delimiter``, ``skip_rows``
                and ``encoding`` needed to read the current file.
            session (boto3 obj): AWS session handed to awswrangler.
            rel_col (str): name of the column used to identify the
                upc/release_id; it is read as ``str``.

        Returns:
            pandas.DataFrame: the loaded data with the ``Month`` (or ``Day``)
            column converted to datetimes and ``Asset Type`` replaced by the
            label returned by ``type``.

        Raises:
            KeyError: if ``features`` lacks a required key, or if the
                ``Month`` conversion fails and there is no ``Day`` column.
        """
        delimiter = features["delimiter"]
        skip_rows = features["skip_rows"]
        encoding = features["encoding"]

        # keep_default_na=False keeps empty cells as empty strings, so the
        # substring checks in ``type`` always receive text for text columns.
        df = wr.s3.read_csv(
            filename,
            sep=delimiter,
            skiprows=skip_rows,
            encoding=encoding,
            dtype={rel_col: str},
            keep_default_na=False,
            low_memory=False,
            boto3_session=session,
        )

        # The period is taken from a "Month" column (YYYYMM); when that
        # column is missing or cannot be parsed, fall back to "Day"
        # (YYYYMMDD). Only data-related errors trigger the fallback, so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            df["Month"] = to_datetime(df["Month"], format="%Y%m")
        except (KeyError, ValueError, TypeError):
            df["Day"] = to_datetime(df["Day"], format="%Y%m%d")

        # Element-wise mapping of the single column; unlike a row-wise
        # ``apply`` it needs no temporary column and always yields a Series,
        # including when the file has no data rows.
        df["Asset Type"] = df["Asset Type"].map(self.type)
        return df

    def type(self, asset_type):
        """Map the report's asset type to the sale-type name used by the client.

        Args:
            asset_type (str): asset type shown in the original file.

        Returns:
            str or None: ``'Youtube Content ID'`` when the value contains
            "Sound Recording", ``'Youtube Music'`` when it contains
            "Art Track" or "Music Video", otherwise ``None``.
        """
        if "Sound Recording" in asset_type:
            return "Youtube Content ID"
        if "Art Track" in asset_type or "Music Video" in asset_type:
            return "Youtube Music"
        # Unrecognised asset types are deliberately left unmapped.
        return None

    def date(self, filename):
        """Derive the reporting period from the file name.

        The name is expected to contain an ``M_<YYYYMMDD>`` token, for
        example ``YouTube_GSDistroPartners_M_20220801_asset_raw_v1-1.csv``.

        Args:
            filename (str): current file name (or path).

        Returns:
            str: the period formatted as ``YYYY-MM``.

        Raises:
            IndexError: if the file name contains no ``M_20<6 digits>`` token.
        """
        found = re.search(r"M_(20\d{6})", filename)
        if found is None:
            # Same exception type as the former ``findall(...)[0]`` lookup,
            # but with a message that identifies the offending file.
            raise IndexError(
                f"no 'M_<YYYYMMDD>' date token found in filename: {filename!r}"
            )
        stamp = found.group(1)
        return f"{stamp[:4]}-{stamp[4:6]}"
src.importer.templates.YoutubeTemplate.YoutubeTemplate
Definition
YoutubeTemplate.py:5
src.importer.templates.YoutubeTemplate.YoutubeTemplate.preprocessing
preprocessing(self, filename, features, session, rel_col)
Definition
YoutubeTemplate.py:6
src.importer.templates.YoutubeTemplate.YoutubeTemplate.type
type(self, asset_type)
Definition
YoutubeTemplate.py:29
src.importer.templates.YoutubeTemplate.YoutubeTemplate.date
date(self, filename)
Definition
YoutubeTemplate.py:41
Generated by
1.12.0