DATA-Code 1-050624-120338


Code 1
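
The first listing specializes a generic Apache Beam pipeline runner, RunPipeline (shown in the second listing), for Casino sales ("ventes") files: it reads a CSV, normalizes and aggregates the rows, derives a typologie label and a supplier code, and writes both the preprocessing and processing outputs to BigQuery.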

import re

import apache_beam as beam

# Project-specific names assumed to be importable from the surrounding
# codebase: CASINO_RETAILER_NAME, SCHEMA, NORMALIZED_COLUMNS,
# VentesTargetOutput, CSVReader, preprocessing_common, sum_values, and the
# RunPipeline base class (defined in the second listing below).


class RunPipelineCasinoVentes(RunPipeline):
    retailer_name = CASINO_RETAILER_NAME
    schema_input = SCHEMA

    def processing_data(self, element):
        # The same dict serves as both the preprocessing and the processing
        # record here, so it is intentionally passed twice.
        element = self.add_technical_fields_processing(element, element)

        # Map the raw "typologie" code to a label: EANs starting with
        # 322247 or 360282 are treated as private label ("MDD"); otherwise
        # "2" means national brand ("MN") and "1" means private label.
        typologie = element[VentesTargetOutput.typologie].strip()
        is_regex = re.search(r"^(322247|360282).+", element[VentesTargetOutput.ean])
        if not is_regex and typologie == "2":
            element[VentesTargetOutput.typologie] = "MN"
        elif not is_regex and typologie == "1":
            element[VentesTargetOutput.typologie] = "MDD"
        elif is_regex and typologie in ("0", "1"):
            element[VentesTargetOutput.typologie] = "MDD"
        else:
            element[VentesTargetOutput.typologie] = None

        # Normalize the supplier code: ensure an "S" prefix, then keep at
        # most seven characters.
        PREFIX_S = "S"
        code_fournisseur = element[VentesTargetOutput.code_fournisseur]
        if code_fournisseur is not None:
            code_fournisseur = code_fournisseur.strip()
            if not code_fournisseur.startswith(PREFIX_S):
                code_fournisseur = PREFIX_S + code_fournisseur
            element[VentesTargetOutput.code_fournisseur] = code_fournisseur[:7]
        return element

    @staticmethod
    def preprocessing_data(element, schema_input):
        return preprocessing_common(element, schema_input)

    def get_reader(self):
        return CSVReader(encoding=self.encoding)

    def apply_processing_data(self, preprocessed_data, pipeline):
        # Rename columns to their normalized names, sum the numeric measures
        # per key tuple, then apply the retailer-specific processing above.
        keys = (
            VentesTargetOutput.annee_mois,
            VentesTargetOutput.ean,
            VentesTargetOutput.code_fournisseur,
            VentesTargetOutput.nom_fournisseur,
            VentesTargetOutput.promo,
            VentesTargetOutput.code_interne,
            VentesTargetOutput.code_ue,
            VentesTargetOutput.code_famille,
            VentesTargetOutput.libelle_pdt,
            VentesTargetOutput.typologie,
        )
        fields_to_sum = (
            VentesTargetOutput.ca_ventes_ht,
            VentesTargetOutput.ca_ventes_ttc,
            VentesTargetOutput.ca_srp,
            VentesTargetOutput.volume,
        )
        individual_fields = VentesTargetOutput().get_technical_fields_preprocessing()
        processing = (
            preprocessed_data
            | NORMALIZED_COLUMNS
            >> beam.Map(
                self.get_normalized_columns,
                schema_input=self.schema_input,
                individual_fields=individual_fields,
            )
            | sum_values(keys, fields_to_sum, individual_fields)
            | "processing" >> beam.Map(self.processing_data)
        )
        return processing
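
For reference, the typologie branching above reduces to a small mapping. Below is a minimal standalone sketch; the regex and labels come from the code above, while the function name and example values are purely illustrative:

from typing import Optional
import re

MDD_EAN = re.compile(r"^(322247|360282).+")

def map_typologie(ean: str, typologie: str) -> Optional[str]:
    # Mirrors RunPipelineCasinoVentes.processing_data: a matching EAN
    # prefix forces "MDD" for codes "0"/"1"; otherwise "2" -> "MN" and
    # "1" -> "MDD"; every other combination becomes None.
    match = MDD_EAN.search(ean)
    if not match and typologie == "2":
        return "MN"
    if not match and typologie == "1":
        return "MDD"
    if match and typologie in ("0", "1"):
        return "MDD"
    return None

assert map_typologie("3000000000000", "2") == "MN"
assert map_typologie("3222470000000", "1") == "MDD"
assert map_typologie("3222470000000", "2") is None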

from datetime import datetime
from typing import Any, Dict

import apache_beam as beam
from apache_beam.io.gcp.bigquery import WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions

# Project-specific names assumed to be importable: Table, ENSEIGNE,
# NORMALIZED_COL_NAME, check_file_exists, delete_and_return_file_path.


class RunPipeline:
    retailer_name = None
    is_processing = True
    schema_input = None

    def __init__(
        self,
        input_data: str,
        options: PipelineOptions,
        date_file: str,
        debug_mode: bool,
        encoding: str,
        table_name_preprocessing: str,
        table_name_processing: str,
    ):
        self.input_data = input_data
        self.options = options
        self.date_file = date_file
        self.debug_mode = debug_mode
        self.encoding = encoding
        self.table_name_preprocessing = table_name_preprocessing
        self.table_name_processing = table_name_processing
        self.is_file = self.input_data.startswith("gs://")

    def keep_idempotence(self, pipeline):
        # Before loading, delete any rows previously written for this
        # date_file so that reruns do not duplicate data.
        empty_pcollection = pipeline | "CreateEmptyPBegin" >> beam.Create([self.input_data])
        if self.is_file:
            empty_pcollection = empty_pcollection | "checkFileExist" >> beam.Map(check_file_exists)

        file_path = empty_pcollection | "DeleteAndReturnFilePath" >> beam.Map(
            delete_and_return_file_path,
            table_preprocessing=self.table_name_preprocessing,
            table_processing=self.table_name_processing,
            column_name_preprocessing=Table.date_file,
            column_name_processing=Table.execution_date,
            date_file=self.date_file,
            is_processing=self.is_processing,
        )
        return file_path

    def write_processing(self, processed_data):
        # In debug mode, print the rows instead of writing them to BigQuery.
        if self.debug_mode:
            processed_data | "Printing before WriteToBigQuery Processing" >> beam.Map(print)
        else:
            processed_data | "WriteProcessing" >> WriteToBigQuery(
                table=self.table_name_processing,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            )

    def write_preprocessing(self, preprocessed_data):
        if self.debug_mode:
            preprocessed_data | "Printing before WriteToBigQuery Preprocessing" >> beam.Map(print)
        else:
            preprocessed_data | "WritePreprocessing" >> WriteToBigQuery(
                table=self.table_name_preprocessing,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            )

    @staticmethod
    def add_technical_fields_processing(
        element_processing: Dict[str, Any],
        element_preprocessing: Dict[str, Any],
    ) -> Dict[str, Any]:
        element_processing[Table.processing_ts] = datetime.utcnow()
        element_processing[Table.preprocessing_ts] = element_preprocessing[Table.preprocessing_ts].isoformat()
        element_processing[Table.source_files] = element_preprocessing.pop(Table.source_file)
        element_processing[Table.execution_date] = element_preprocessing.pop(Table.date_file)
        element_processing[Table.retailer] = element_preprocessing[Table.retailer]
        return element_processing

    def add_technical_fields_preprocessing(
        self,
        element: Dict[str, Any],
        source_file: str,
        date_file: str,
        retailer_name: Any,
    ) -> Dict[str, Any]:
        element[Table.preprocessing_ts] = datetime.utcnow()
        element[Table.source_file] = source_file
        element[Table.date_file] = date_file
        # retailer_name may be a single name or a mapping keyed by the
        # ENSEIGNE column when one file carries several banners.
        if isinstance(retailer_name, dict):
            element[Table.retailer] = next(
                (value for key, value in retailer_name.items() if element[ENSEIGNE] == key),
                "null",
            )
        else:
            element[Table.retailer] = retailer_name
        return element

    @staticmethod
    def processing_data(element):
        return element

    def apply_processing_data(self, preprocessed_data, pipeline):
        return preprocessed_data | "processing" >> beam.Map(self.processing_data)

    @staticmethod
    def preprocessing_data(element: Dict[str, Any], schema_input) -> Dict[str, Any]:
        return element

    def apply_preprocessing_data(self, reader_data):
        return (
            reader_data
            | "preprocessing" >> beam.Map(self.preprocessing_data, schema_input=self.schema_input)
            | "TechnicalsFieldsPreprocessing"
            >> beam.Map(
                self.add_technical_fields_preprocessing,
                source_file=self.input_data,
                date_file=self.date_file,
                retailer_name=self.retailer_name,
            )
        )

    def get_reader(self):
        # Hook for subclasses: must return a DoFn that reads and parses the
        # input (e.g. the CSVReader used by RunPipelineCasinoVentes).
        raise NotImplementedError

    @staticmethod
    def get_normalized_columns(
        element,
        schema_input,
        individual_fields=(),
    ):
        # Keep only the columns that declare a normalized name in the
        # schema, renaming them, plus the technical fields.
        result = {}
        for k, v in schema_input.items():
            if NORMALIZED_COL_NAME in v:
                result[v[NORMALIZED_COL_NAME]] = element[k]
        for k in individual_fields:
            result[k] = element[k]
        return result

    def run(self) -> None:
        with beam.Pipeline(options=self.options) as pipeline:
            file_path = self.keep_idempotence(pipeline)

            reader_data = file_path | "ReadAndParse" >> beam.ParDo(self.get_reader())

            preprocessed_data = self.apply_preprocessing_data(reader_data)

            self.write_preprocessing(preprocessed_data)

            if self.is_processing:
                processed_data = self.apply_processing_data(preprocessed_data, pipeline=pipeline)

                self.write_processing(processed_data)
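A minimal sketch of how the pipeline might be launched. The constructor signature is RunPipeline.__init__ above; the runner choice, bucket path, dates, and table names are placeholders, not taken from the source:

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(runner="DirectRunner")  # local runner for testing

pipeline = RunPipelineCasinoVentes(
    input_data="gs://example-bucket/casino/ventes.csv",  # placeholder path
    options=options,
    date_file="2024-06-01",  # placeholder date
    debug_mode=True,         # print rows instead of writing to BigQuery
    encoding="utf-8",
    table_name_preprocessing="project:dataset.ventes_preprocessing",  # placeholder
    table_name_processing="project:dataset.ventes_processing",        # placeholder
)
pipeline.run()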
