import unittest
import pandas as pd
import json
from unittest.mock import patch, MagicMock
import sys
import os
# Add the scripts directory to the path
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'scripts'))
from generate_schema_mapping import extract_csv_schema, parse_mapping_from_response
class TestSchemaExtraction(unittest.TestCase):
    """Unit tests for per-column schema extraction from a pandas DataFrame.

    NOTE(review): every test here exercises the local
    ``extract_schema_from_dataframe`` helper, not the ``extract_csv_schema``
    function imported at the top of the file. Confirm that the helper mirrors
    the production logic; otherwise these tests do not cover it.
    """

    # pandas reports plain string columns as 'object' before 3.0 and as 'str'
    # once the dedicated string dtype is the default, so accept either name.
    STRING_DTYPE_NAMES = ('object', 'str')

    def setUp(self):
        """Build a five-row customer table with one column per basic dtype."""
        self.sample_csv_data = pd.DataFrame({
            'customer_id': [1, 2, 3, 4, 5],
            'first_name': ['John', 'Jane', 'Bob', 'Alice', 'Charlie'],
            # Distinct addresses on the reserved example.com domain.
            'email': [
                'john@example.com',
                'jane@example.com',
                'bob@example.com',
                'alice@example.com',
                'charlie@example.com',
            ],
            'created_date': [
                '2024-01-01',
                '2024-01-02',
                '2024-01-03',
                '2024-01-04',
                '2024-01-05',
            ],
            'is_active': [True, True, False, True, True],
        })

    def test_column_type_detection(self):
        """Reported dtype strings match the dtypes pandas inferred."""
        schema_info = self.extract_schema_from_dataframe(
            self.sample_csv_data, 'customers'
        )

        self.assertEqual(len(schema_info['columns']), 5)

        columns = {col['name']: col for col in schema_info['columns']}
        self.assertEqual(columns['customer_id']['dtype'], 'int64')
        self.assertIn(columns['first_name']['dtype'], self.STRING_DTYPE_NAMES)
        self.assertIn(columns['email']['dtype'], self.STRING_DTYPE_NAMES)
        self.assertEqual(columns['is_active']['dtype'], 'bool')

    def test_semantic_type_inference(self):
        """Column names containing id/email/date map to their semantic type."""
        schema_info = self.extract_schema_from_dataframe(
            self.sample_csv_data, 'customers'
        )
        columns = {col['name']: col for col in schema_info['columns']}

        self.assertEqual(columns['customer_id']['semantic_type'], 'identifier')
        self.assertEqual(columns['email']['semantic_type'], 'email')
        self.assertEqual(columns['created_date']['semantic_type'], 'temporal')

    def test_remaining_semantic_types(self):
        """Phone, numeric and text fallbacks are assigned as expected."""
        frame = pd.DataFrame({
            'phone': ['555-0100', '555-0101'],
            'amount': [1.5, 2.5],
            'first_name': ['John', 'Jane'],
        })
        schema_info = self.extract_schema_from_dataframe(frame, 'contacts')
        columns = {col['name']: col for col in schema_info['columns']}

        self.assertEqual(columns['phone']['semantic_type'], 'phone')
        self.assertEqual(columns['amount']['semantic_type'], 'numeric')
        self.assertEqual(columns['first_name']['semantic_type'], 'text')

    def test_null_percentage_calculation(self):
        """Two nulls out of five rows are reported as 40 percent."""
        test_data = self.sample_csv_data.copy()
        test_data.loc[0, 'first_name'] = None
        test_data.loc[1, 'first_name'] = None

        schema_info = self.extract_schema_from_dataframe(test_data, 'customers')
        columns = {col['name']: col for col in schema_info['columns']}

        self.assertEqual(columns['first_name']['null_count'], 2)
        # The percentage is a float; compare with a tolerance.
        self.assertAlmostEqual(columns['first_name']['null_percentage'], 40.0)

    def test_unique_count_calculation(self):
        """Unique counts reflect the number of distinct values per column."""
        schema_info = self.extract_schema_from_dataframe(
            self.sample_csv_data, 'customers'
        )
        columns = {col['name']: col for col in schema_info['columns']}

        self.assertEqual(columns['customer_id']['unique_count'], 5)
        self.assertEqual(columns['is_active']['unique_count'], 2)

    def test_empty_dataframe(self):
        """A frame with no rows yields 0.0 null percentages, not NaN."""
        empty = pd.DataFrame({'customer_id': [], 'first_name': []})
        schema_info = self.extract_schema_from_dataframe(empty, 'empty')

        self.assertEqual(schema_info['row_count'], 0)
        for column in schema_info['columns']:
            self.assertEqual(column['null_count'], 0)
            self.assertEqual(column['null_percentage'], 0.0)
            self.assertEqual(column['unique_count'], 0)
            self.assertEqual(column['sample_values'], [])

    def extract_schema_from_dataframe(self, df, table_name):
        """Summarise every column of *df* into a schema dictionary.

        Args:
            df: The DataFrame to describe.
            table_name: Name stored under the ``'table_name'`` key.

        Returns:
            A dict with ``'table_name'``, ``'row_count'`` and ``'columns'``.
            Each entry of ``'columns'`` holds the column's name, dtype string,
            null count, null percentage, unique count, up to five non-null
            sample values and an inferred semantic type.
        """
        row_count = len(df)
        schema_info = {
            'table_name': table_name,
            'row_count': row_count,
            'columns': [],
        }

        for col in df.columns:
            series = df[col]
            null_count = int(series.isnull().sum())
            # Guard the division: an empty frame would otherwise produce NaN.
            if row_count:
                null_percentage = float(null_count / row_count * 100)
            else:
                null_percentage = 0.0

            col_info = {
                'name': col,
                'dtype': str(series.dtype),
                'null_count': null_count,
                'null_percentage': null_percentage,
                'unique_count': int(series.nunique()),
                'sample_values': series.dropna().head(5).tolist(),
            }

            # Name-based checks run first and in this order; the dtype is
            # only consulted when no keyword matches.
            # NOTE(review): these are substring matches, so 'id' also matches
            # names such as 'paid' or 'width' - confirm this is intended.
            lowered = col.lower()
            if 'id' in lowered:
                semantic_type = 'identifier'
            elif 'email' in lowered:
                semantic_type = 'email'
            elif 'phone' in lowered:
                semantic_type = 'phone'
            elif 'date' in lowered or 'time' in lowered:
                semantic_type = 'temporal'
            elif series.dtype in ('float64', 'int64'):
                semantic_type = 'numeric'
            else:
                semantic_type = 'text'
            col_info['semantic_type'] = semantic_type

            schema_info['columns'].append(col_info)

        return schema_info
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()