Cleaning and modifying the clients and districts tables
- The notebooks are best viewed on GitHub Pages: when browsing them directly in the repository there can be rendering problems, e.g. output with wide dataframes gets cut off. If the browser address bar contains iaroslav-dzh.github.io, you are already on GitHub Pages.
Links:
Information about the notebook
- These tables come from the Berka dataset (a Czech bank dataset). Initially the plan was to work with the original tables, only transforming them slightly, but I later decided to replace the Czech districts with Russian cities, which is done in another notebook further on.
In [2]:
import pandas as pd
import numpy as np
import os
from data_generator.utils import load_configs
In [3]:
os.chdir('..')
In [23]:
# Load the base configs
base_cfg = load_configs("./config/base.yaml")
# Paths to data files
data_paths = base_cfg["data_paths"]
In [9]:
# Check for NaN values anywhere in a dataframe
def check_for_nans(df: pd.DataFrame):
    rows_isna = df.isna().any(axis=1)
    if rows_isna.any():
        print("Following rows have missing values!")
        return df[rows_isna]
    else:
        print("No missing values")
In [10]:
# Check a dataframe's shape after modifications
# df_name: the dataframe's name, used in messages
def check_df_integrity(initial_df_shape: tuple[int, int], current_df: pd.DataFrame, df_name: str, axis: int | None = None):
    if not isinstance(df_name, str):
        raise TypeError(f'df_name must be a string (name of a DataFrame), but got {type(df_name)}')
    current_shape = current_df.shape
    axis_names = {0: 'rows', 1: 'columns'}
    if axis is not None:
        initial_ax = initial_df_shape[axis]
        current_ax = current_shape[axis]
        other_axis = 1 - axis
        if initial_ax != current_ax:
            raise ValueError(f'''Something is wrong with {axis_names[axis]}!
initial {axis_names[axis]} shape is: {initial_ax}
current {axis_names[axis]} shape is: {current_ax}''')
        else:
            initial_other = initial_df_shape[other_axis]
            current_other = current_shape[other_axis]
            if current_other != initial_other:
                print(f'''{axis_names[axis]} count is equal.
But changes were made to {axis_names[other_axis]}
initial {axis_names[other_axis]} shape is: {initial_df_shape[other_axis]}
current {axis_names[other_axis]} shape is: {current_shape[other_axis]}''')
            else:
                print(f"{df_name} is intact!")
    else:
        if initial_df_shape != current_shape:
            raise ValueError(f'''Something is wrong!
initial shape is: {initial_df_shape}
current shape is: {current_shape}''')
        else:
            print(f"{df_name} is intact!")
The client table
item | meaning | remark |
---|---|---|
client_id | record identifier | |
birth number | identification of client | the number is in the form YYMMDD for men, the number is in the form YYMM+50DD for women, where YYMMDD is the date of birth |
district_id | address of the client | |
**YYMM+50DD for women** means that for a man born on, say, 1972-04-01 the number is 720401, while for a woman it is 725401. We need to filter the rows where the middle two digits are greater than 12 and derive a sex column from that condition; for women the middle two digits must also be corrected by subtracting 50 from them (i.e. 5000 from the whole number).
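A minimal worked example of this rule (standalone arithmetic only; the actual notebook implementation follows below):
# 725401 is a woman's birth_number: the month field is 54, which is greater than 12
num = 725401
month_field = num // 100 % 100        # -> 54
assert month_field > 12               # so the record belongs to a woman
assert num - 5000 == 720401           # subtracting 5000 restores the real date 1972-04-01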
In [5]:
client_df = pd.read_csv(data_paths["raw"]["clients"], sep=';')
In [6]:
client_df.head(3)
Out[6]:
  | client_id | birth_number | district_id
---|---|---|---
0 | 1 | 706213 | 18
1 | 2 | 450204 | 1
2 | 3 | 406009 | 1
In [7]:
client_df.dtypes
Out[7]:
client_id       int64
birth_number    int64
district_id     int64
dtype: object
In [11]:
# Check for NaN in any column
check_for_nans(client_df)
No missing values
In [12]:
client_initial_shape = client_df.shape
client_initial_shape
Out[12]:
(5369, 3)
In [13]:
# Determine sex from the encoded date; for women, also return the regular date,
# still as an integer
def check_birth_num(birth_num):
    if birth_num // 100 % 100 > 50:
        return birth_num - 5000, 'female'
    elif birth_num // 100 % 100 <= 12:
        return birth_num, 'male'
    else:
        return 'invalid_date', 'unknown'
In [14]:
client_df['birth_date'], client_df['sex'] = zip(*client_df.birth_number.map(check_birth_num))
In [15]:
def parse_int_date(date: int):
    """
    Convert a date stored as an integer into a pd.Timestamp.
    """
    year = date // 10_000
    month = date % 10_000 // 100
    day = date % 10_000 % 100
    full_year = 1900 + year if year > 9 else 2000 + year
    return pd.to_datetime(f'{full_year}-{month}-{day}')
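A quick sanity check of the century rule (illustrative only, using a date from the table above): two-digit years above 9 are mapped to the 1900s.
# 450204 -> year 45 -> 1945, month 02, day 04
assert parse_int_date(450204) == pd.Timestamp('1945-02-04')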
In [16]:
client_df['birth_date'] = client_df['birth_date'].map(parse_int_date)
In [17]:
client_df.head()
Out[17]:
  | client_id | birth_number | district_id | birth_date | sex
---|---|---|---|---|---
0 | 1 | 706213 | 18 | 1970-12-13 | female
1 | 2 | 450204 | 1 | 1945-02-04 | male
2 | 3 | 406009 | 1 | 1940-10-09 | female
3 | 4 | 561201 | 5 | 1956-12-01 | male
4 | 5 | 605703 | 5 | 1960-07-03 | female
In [18]:
# Verify that nobody was mislabelled: every row marked "female" must have the 50+MM month offset
# filtered_df holds rows where the first two digits of birth_number change after subtracting 5000
# (i.e. the month offset was absent) while the sex is "female" - a man mistakenly marked as a woman
filtered_df = client_df[((client_df.birth_number - 5000) // 10000 != client_df.birth_number // 10000) \
                        & (client_df.sex == 'female')]
if not filtered_df.empty:
    raise ValueError(f'''Wrong rows in client_df!
{filtered_df}''')
else:
    print('client_df is correct.')
client_df is correct.
In [19]:
client_df.dtypes
Out[19]:
client_id               int64
birth_number            int64
district_id             int64
birth_date     datetime64[ns]
sex                    object
dtype: object
In [20]:
client_df = client_df.drop(columns='birth_number')
client_df.head(2)
Out[20]:
  | client_id | district_id | birth_date | sex
---|---|---|---|---
0 | 1 | 18 | 1970-12-13 | female
1 | 2 | 1 | 1945-02-04 | male
In [21]:
# Check client_df integrity by rows after the changes
check_df_integrity(client_initial_shape, client_df, 'client_df', 0)
rows count is equal.
But changes were made to columns
initial columns shape is: 3
current columns shape is: 4
In [22]:
client_df.to_csv(data_paths["cleaned"]["clients"], index=False)
The Demographic data (District) table
- Of the available columns we will keep only district code, district name, and no. of inhabitants.
item | meaning | remark |
---|---|---|
A1 = district_id | district code | |
A2 | district name | |
A3 | region | |
A4 | no. of inhabitants | |
A5 | no. of municipalities with inhabitants < 499 | |
A6 | no. of municipalities with inhabitants 500-1999 | |
A7 | no. of municipalities with inhabitants 2000-9999 | |
A8 | no. of municipalities with inhabitants >10000 | |
A9 | no. of cities | |
A10 | ratio of urban inhabitants | |
A11 | average salary | |
A12 | unemployment rate '95 | |
A13 | unemployment rate '96 | |
A14 | no. of entrepreneurs per 1000 inhabitants | |
A15 | no. of committed crimes '95 | |
A16 | no. of committed crimes '96 | |
In [24]:
district_df = pd.read_csv(data_paths["raw"]["district"], sep=';')
In [25]:
district_initial_shape = district_df.shape
district_initial_shape
Out[25]:
(77, 16)
In [30]:
district_df.head(3)
Out[30]:
  | A1 | A2 | A3 | A4 | A5 | A6 | A7 | A8 | A9 | A10 | A11 | A12 | A13 | A14 | A15 | A16
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | Hl.m. Praha | Prague | 1204953 | 0 | 0 | 0 | 1 | 1 | 100.0 | 12541 | 0.29 | 0.43 | 167 | 85677 | 99107
1 | 2 | Benesov | central Bohemia | 88884 | 80 | 26 | 6 | 2 | 5 | 46.7 | 8507 | 1.67 | 1.85 | 132 | 2159 | 2674
2 | 3 | Beroun | central Bohemia | 75232 | 55 | 26 | 4 | 1 | 5 | 41.7 | 8980 | 1.95 | 2.21 | 111 | 2824 | 2813
In [27]:
# a raw string with the column names in order
district_col_raw = ''' district code |
| district name |
| region |
| population |
| no_of_mun_below_500 |
| no of mun_between_500_1999 |
| no of mun_between_2000_9999 |
| no of mun_above_10000 |
| no. of cities |
| ratio of urban population |
| avg_salary |
| unemployment rate '95 |
| unemployment rate '96 |
| enterpreneurs per_1000 |
| crimes num '95 |
| crimes num '96 '''
In [28]:
# split the string on the | character
district_col_split = district_col_raw.split('|')
In [29]:
district_col_split
Out[29]:
[' district code ', '\n', ' district name ', '\n', ' region ', '\n', ' population ', '\n', ' no_of_mun_below_500 ', '\n', ' no of mun_between_500_1999 ', '\n', ' no of mun_between_2000_9999 ', '\n', ' no of mun_above_10000 ', '\n', ' no. of cities ', '\n', ' ratio of urban population ', '\n', ' avg_salary ', '\n', " unemployment rate '95 ", '\n', " unemployment rate '96 ", '\n', ' enterpreneurs per_1000 ', '\n', " crimes num '95 ", '\n', " crimes num '96 "]
In [31]:
# strip whitespace from the start and end of each string
# replace spaces with _
# remove apostrophes
# remove dots
dist_col_series = pd.Series(district_col_split).str.strip() \
                                               .str.replace(' ', '_') \
                                               .str.replace('\'', '') \
                                               .str.replace('.', '')
# Filter the series to get rid of empty strings and reset the index
dist_col_clean = dist_col_series[dist_col_series != ''].reset_index(drop=True)
dist_col_clean
Out[31]:
0                   district_code
1                   district_name
2                          region
3                      population
4             no_of_mun_below_500
5      no_of_mun_between_500_1999
6     no_of_mun_between_2000_9999
7           no_of_mun_above_10000
8                    no_of_cities
9       ratio_of_urban_population
10                     avg_salary
11           unemployment_rate_95
12           unemployment_rate_96
13         enterpreneurs_per_1000
14                  crimes_num_95
15                  crimes_num_96
dtype: object
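For reference, the same list of names could be built in a single expression; a minimal alternative sketch (not used further in this notebook):
# strip each piece, drop the empty ones, then normalize spaces, apostrophes and dots
alt_cols = [piece.strip().replace(' ', '_').replace("'", '').replace('.', '')
            for piece in district_col_raw.split('|') if piece.strip()]
assert alt_cols == dist_col_clean.tolist()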
In [33]:
# assign the cleaned column names to district_df
district_df.columns = dist_col_clean
district_df.head(3)
Out[33]:
  | district_code | district_name | region | population | no_of_mun_below_500 | no_of_mun_between_500_1999 | no_of_mun_between_2000_9999 | no_of_mun_above_10000 | no_of_cities | ratio_of_urban_population | avg_salary | unemployment_rate_95 | unemployment_rate_96 | enterpreneurs_per_1000 | crimes_num_95 | crimes_num_96
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | Hl.m. Praha | Prague | 1204953 | 0 | 0 | 0 | 1 | 1 | 100.0 | 12541 | 0.29 | 0.43 | 167 | 85677 | 99107
1 | 2 | Benesov | central Bohemia | 88884 | 80 | 26 | 6 | 2 | 5 | 46.7 | 8507 | 1.67 | 1.85 | 132 | 2159 | 2674
2 | 3 | Beroun | central Bohemia | 75232 | 55 | 26 | 4 | 1 | 5 | 41.7 | 8980 | 1.95 | 2.21 | 111 | 2824 | 2813
In [35]:
# Keep only the columns we need
district_df = district_df.loc[:, ['district_code', 'district_name', 'population']]
district_df.head(2)
Out[35]:
  | district_code | district_name | population
---|---|---|---
0 | 1 | Hl.m. Praha | 1204953
1 | 2 | Benesov | 88884
In [36]:
district_df.isna().sum()
Out[36]:
district_code    0
district_name    0
population       0
dtype: int64
In [37]:
# save without the index, same as for client_df
district_df.to_csv(data_paths["cleaned"]["districts"], index=False)