数据质量与评估
1. 概述
数据质量是AI模型训练成功的关键因素。本章详细介绍了AI UI生成系统中数据质量控制的机制、评估方法和优化策略。
2. 数据质量控制机制
2.1 数据生成质量控制
2.1.1 模板驱动的数据生成
系统使用模板驱动的方式生成训练数据,确保数据的结构一致性:
class UIDataGenerator:
    """UI data generator - core code snippet."""

    def __init__(self, config_path: str = "config/model_config.yaml"):
        """Load the YAML config and set up generation parameters."""
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)
        # Page templates keep every generated sample structurally consistent.
        self.page_templates = self._load_page_templates()
        # Supported visual themes (theme consistency).
        self.themes = ["obsidian-gold", "silver-white", "minimal"]
        # Supported page types (type diversity).
        self.page_types = ["home", "detail", "search", "profile", "publish"]
2.1.2 数据变化生成策略
系统通过组合不同的变化参数生成多样化的数据:
def _generate_variations(self, page_type: str) -> List[Dict[str, Any]]:
"""生成页面变化 - 确保数据多样性"""
variations = []
if page_type == "home":
# 列数变化:1列或2列
for columns in [1, 2]:
# 搜索栏变化:有或无
for has_search in [True, False]:
# 标签变化:有或无
for has_tabs in [True, False]:
variations.append({
"columns": columns,
"has_search": has_search,
"has_tabs": has_tabs
})
elif page_type == "detail":
# 轮播图变化
for has_carousel in [True, False]:
# 价格变化
for has_price in [True, False]:
# 卖家信息变化
for has_seller in [True, False]:
variations.append({
"has_carousel": has_carousel,
"has_price": has_price,
"has_seller": has_seller
})
return variations
2.2 数据一致性保证
2.2.1 Prompt与DSL的一致性
系统确保生成的Prompt描述与对应的DSL结构完全匹配:
def _generate_prompt(self, page_type: str, theme: str, variations: Dict[str, Any]) -> str:
"""生成中文Prompt - 确保与DSL一致"""
template = self.page_templates[page_type]
base_desc = template["description"]
# 主题描述映射
theme_desc = {
"obsidian-gold": "黑金风格",
"silver-white": "白银风格",
"minimal": "简约风格"
}[theme]
# 构建Prompt
prompt_parts = [theme_desc, base_desc]
# 根据变化添加具体描述
if page_type == "home":
if variations.get("columns", 2) == 1:
prompt_parts.append("单列商品展示")
else:
prompt_parts.append(f"{variations.get('columns', 2)}列商品卡片")
if variations.get("has_search"):
prompt_parts.append("顶部搜索栏")
if variations.get("has_tabs"):
prompt_parts.append("顶部标签切换")
return ",".join(prompt_parts)
2.2.2 DSL结构验证
def _generate_dsl(self, page_type: str, theme: str, variations: Dict[str, Any]) -> Dict[str, Any]:
"""生成UI-DSL - 确保结构正确性"""
template = self.page_templates[page_type]
# 基础DSL结构
dsl = {
"page": {
"name": f"{page_type}_page",
"theme": theme,
"layout": {
"grid": 12,
"gutter": 16,
"padding": 16,
"bg": "#0E0E0E" if theme == "obsidian-gold" else "#FFFFFF"
},
"sections": []
}
}
# 根据模板和变化生成sections
for section in template["sections"]:
section_copy = section.copy()
# 应用变化
if section["type"] == "card-list":
section_copy["props"]["columns"] = variations.get("columns", 2)
elif section["type"] == "topbar":
if not variations.get("has_search", True):
if "search" in section_copy["props"].get("actions", []):
section_copy["props"]["actions"].remove("search")
dsl["page"]["sections"].append(section_copy)
return dsl
3. 数据验证规则
3.1 结构验证
3.1.1 必需字段验证
def validate_dsl_structure(dsl_data: Dict) -> bool:
    """Return True when the DSL contains every required field.

    Checks three levels: the top-level "page" key, the page's own
    required keys, and the layout's required keys.
    """
    if "page" not in dsl_data:
        return False
    page = dsl_data["page"]
    if any(key not in page for key in ("name", "theme", "layout", "sections")):
        return False
    layout = page["layout"]
    return all(key in layout for key in ("grid", "gutter", "padding", "bg"))
3.1.2 类型验证
def validate_dsl_types(dsl_data: Dict) -> bool:
    """Return True when every DSL field has the expected type and format.

    Assumes the structural check has already passed (keys exist).
    """
    page = dsl_data["page"]
    # name / theme must be non-empty strings; sections a non-empty list.
    if not (isinstance(page["name"], str) and page["name"]):
        return False
    if not (isinstance(page["theme"], str) and page["theme"]):
        return False
    if not (isinstance(page["sections"], list) and page["sections"]):
        return False
    # Numeric layout fields must be positive integers.
    layout = page["layout"]
    for key in ("grid", "gutter", "padding"):
        value = layout[key]
        if not isinstance(value, int) or value <= 0:
            return False
    # Background must be a 6-digit hex color.
    return bool(re.match(r'^#[0-9A-Fa-f]{6}$', layout["bg"]))
3.2 业务逻辑验证
3.2.1 组件类型验证
def validate_component_types(dsl_data: Dict) -> bool:
    """Return True when every section uses a known component type
    and carries the props that component requires.
    """
    # Closed set of component types the renderer understands.
    valid_components = {
        "topbar", "tabs", "card-list", "carousel", "price",
        "seller", "proof", "cta", "tabbar", "user-info",
        "menu-list", "form", "filters"
    }
    return all(
        section.get("type") in valid_components
        # Delegate per-component required-prop checks.
        and validate_component_props(section)
        for section in dsl_data["page"]["sections"]
    )
3.2.2 主题验证
def validate_theme(dsl_data: Dict) -> bool:
    """Return True when the page uses one of the supported themes."""
    return dsl_data["page"]["theme"] in ("obsidian-gold", "silver-white", "minimal")
4. 数据统计分析
4.1 数据集统计信息
系统自动生成数据集统计信息,存储在 dataset_stats.json 中:
{
"total_samples": 1000,
"train_samples": 800,
"val_samples": 100,
"test_samples": 100,
"themes": [
"obsidian-gold",
"silver-white",
"minimal"
],
"page_types": [
"home",
"detail",
"search",
"profile",
"publish"
]
}
4.2 数据分布分析
4.2.1 页面类型分布
def analyze_page_type_distribution(dataset: List[Dict]) -> Dict[str, int]:
    """Count samples per page type.

    The type is the prefix of the page name (names look like
    "<type>_page").
    """
    counts: Dict[str, int] = {}
    for sample in dataset:
        page = json.loads(sample["output_json_minified"])["page"]
        key = page["name"].split("_")[0]
        counts[key] = counts.get(key, 0) + 1
    return counts
4.2.2 主题分布
def analyze_theme_distribution(dataset: List[Dict]) -> Dict[str, int]:
    """Count how many samples use each theme."""
    counts: Dict[str, int] = {}
    for sample in dataset:
        page = json.loads(sample["output_json_minified"])["page"]
        counts[page["theme"]] = counts.get(page["theme"], 0) + 1
    return counts
4.2.3 组件使用频率
def analyze_component_usage(dataset: List[Dict]) -> Dict[str, int]:
    """Count how often each component type appears across all samples."""
    usage: Dict[str, int] = {}
    for sample in dataset:
        page = json.loads(sample["output_json_minified"])["page"]
        for section in page["sections"]:
            kind = section["type"]
            usage[kind] = usage.get(kind, 0) + 1
    return usage
4.3 数据质量指标
4.3.1 完整性指标
def calculate_completeness_score(dataset: List[Dict]) -> float:
    """Fraction of samples whose DSL parses and passes structure/type checks.

    Args:
        dataset: Samples, each carrying an "output_json_minified" string.

    Returns:
        valid / total in [0.0, 1.0]; 0 for an empty dataset.
    """
    total_samples = len(dataset)
    valid_samples = 0
    for sample in dataset:
        try:
            dsl = json.loads(sample["output_json_minified"])
            if validate_dsl_structure(dsl) and validate_dsl_types(dsl):
                valid_samples += 1
        # Narrowed from a bare `except:` — skip only malformed samples
        # (missing key, non-string payload, invalid JSON) instead of
        # swallowing programming errors and KeyboardInterrupt.
        except (KeyError, TypeError, ValueError):
            continue
    return valid_samples / total_samples if total_samples > 0 else 0
4.3.2 多样性指标
def calculate_diversity_score(dataset: List[Dict]) -> Dict[str, float]:
    """Score dataset diversity as coverage of the known value spaces.

    Args:
        dataset: Samples, each carrying an "output_json_minified" string.

    Returns:
        Dict with page-type, theme and component coverage ratios in [0, 1].
    """
    page_types = set()
    themes = set()
    components = set()
    for sample in dataset:
        try:
            page = json.loads(sample["output_json_minified"])["page"]
            page_types.add(page["name"].split("_")[0])
            themes.add(page["theme"])
            for section in page["sections"]:
                components.add(section["type"])
        # Narrowed from a bare `except:` — skip only malformed samples
        # instead of swallowing programming errors and KeyboardInterrupt.
        except (AttributeError, KeyError, TypeError, ValueError):
            continue
    return {
        "page_type_diversity": len(page_types) / 5,  # 5 page types
        "theme_diversity": len(themes) / 3,          # 3 themes
        "component_diversity": len(components) / 13  # 13 component types
    }
5. 数据平衡策略
5.1 样本数量平衡
def balance_dataset_by_page_type(dataset: List[Dict], target_samples_per_type: int = 200) -> List[Dict]:
    """Cap the number of samples kept per page type.

    Dataset order is preserved; samples beyond the cap — and malformed
    samples — are dropped.

    Args:
        dataset: Samples, each carrying an "output_json_minified" string.
        target_samples_per_type: Maximum samples to keep per page type.

    Returns:
        The filtered sample list.
    """
    balanced_dataset: List[Dict] = []
    page_type_counts: Dict[str, int] = {}
    for sample in dataset:
        try:
            dsl = json.loads(sample["output_json_minified"])
            page_type = dsl["page"]["name"].split("_")[0]
        # Narrowed from a bare `except:` — skip only malformed samples
        # instead of swallowing programming errors and KeyboardInterrupt.
        except (AttributeError, KeyError, TypeError, ValueError):
            continue
        current_count = page_type_counts.get(page_type, 0)
        if current_count < target_samples_per_type:
            balanced_dataset.append(sample)
            page_type_counts[page_type] = current_count + 1
    return balanced_dataset
5.2 主题分布平衡
def balance_dataset_by_theme(dataset: List[Dict], target_samples_per_theme: int = 333) -> List[Dict]:
    """Cap the number of samples kept per theme.

    Dataset order is preserved; samples beyond the cap — and malformed
    samples — are dropped.

    Args:
        dataset: Samples, each carrying an "output_json_minified" string.
        target_samples_per_theme: Maximum samples to keep per theme.

    Returns:
        The filtered sample list.
    """
    balanced_dataset: List[Dict] = []
    theme_counts: Dict[str, int] = {}
    for sample in dataset:
        try:
            theme = json.loads(sample["output_json_minified"])["page"]["theme"]
        # Narrowed from a bare `except:` — skip only malformed samples
        # instead of swallowing programming errors and KeyboardInterrupt.
        except (KeyError, TypeError, ValueError):
            continue
        current_count = theme_counts.get(theme, 0)
        if current_count < target_samples_per_theme:
            balanced_dataset.append(sample)
            theme_counts[theme] = current_count + 1
    return balanced_dataset
6. 数据增强技术
6.1 文本增强
def augment_prompt_text(prompt: str) -> List[str]:
    """Augment a prompt by substituting synonyms for theme phrases.

    The original prompt is always the first element of the result.
    """
    # Theme phrase -> interchangeable wordings.
    synonyms = {
        "黑金风格": ["深色主题", "暗色风格"],
        "白银风格": ["浅色主题", "明亮风格"],
        "简约风格": ["极简风格", "简洁设计"]
    }
    results = [prompt]
    for phrase, alternatives in synonyms.items():
        if phrase not in prompt:
            continue
        results.extend(prompt.replace(phrase, alt) for alt in alternatives)
    return results
6.2 结构增强
def augment_dsl_structure(dsl: Dict) -> List[Dict]:
    """Augment a DSL by emitting a 1-column variant of each 2-column card list.

    Args:
        dsl: A complete DSL dict with a "page"/"sections" structure.

    Returns:
        List whose first element is the original DSL, followed by one
        deep-copied variant per eligible card-list section.
    """
    augmented_dsls = [dsl]
    for idx, section in enumerate(dsl["page"]["sections"]):
        if section["type"] == "card-list" and section["props"]["columns"] == 2:
            new_dsl = copy.deepcopy(dsl)
            # BUG FIX: the original re-located the section with
            # sections.index(section), which returns the FIRST equal
            # section and mis-targets duplicates; use the enumerate
            # index instead.
            new_dsl["page"]["sections"][idx]["props"]["columns"] = 1
            augmented_dsls.append(new_dsl)
    return augmented_dsls
7. 数据质量监控
7.1 实时质量检查
class DataQualityMonitor:
    """Tracks aggregate quality metrics for a dataset."""

    def __init__(self):
        # Each metric lives in [0.0, 1.0].
        self.quality_metrics = {
            "completeness": 0.0,
            "diversity": 0.0,
            "consistency": 0.0
        }

    def check_sample_quality(self, sample: Dict) -> Dict[str, bool]:
        """Run every validator on one sample.

        A sample whose JSON cannot be parsed leaves every check False.
        """
        checks = {
            "structure_valid": False,
            "type_valid": False,
            "theme_valid": False,
            "component_valid": False
        }
        try:
            dsl = json.loads(sample["output_json_minified"])
            checks["structure_valid"] = validate_dsl_structure(dsl)
            checks["type_valid"] = validate_dsl_types(dsl)
            checks["theme_valid"] = validate_theme(dsl)
            checks["component_valid"] = validate_component_types(dsl)
        # Narrowed from a bare `except:` — swallow only malformed samples
        # instead of programming errors and KeyboardInterrupt.
        except (KeyError, TypeError, ValueError):
            pass
        return checks

    def update_quality_metrics(self, dataset: List[Dict]):
        """Recompute completeness, diversity and consistency over the dataset."""
        self.quality_metrics["completeness"] = calculate_completeness_score(dataset)
        diversity = calculate_diversity_score(dataset)
        # Average the three coverage ratios into a single score.
        self.quality_metrics["diversity"] = sum(diversity.values()) / len(diversity)
        self.quality_metrics["consistency"] = self._calculate_consistency_score(dataset)

    def _calculate_consistency_score(self, dataset: List[Dict]) -> float:
        """Compute a consistency score.

        TODO: real consistency checking is not implemented yet; this
        returns a fixed placeholder value.
        """
        return 0.95  # placeholder
7.2 质量报告生成
def generate_quality_report(dataset: List[Dict]) -> Dict:
    """Assemble a full quality report: size, metrics, distributions, advice."""
    monitor = DataQualityMonitor()
    monitor.update_quality_metrics(dataset)
    return {
        "dataset_size": len(dataset),
        "quality_metrics": monitor.quality_metrics,
        "distribution": {
            "page_types": analyze_page_type_distribution(dataset),
            "themes": analyze_theme_distribution(dataset),
            "components": analyze_component_usage(dataset)
        },
        "recommendations": generate_quality_recommendations(monitor.quality_metrics)
    }
def generate_quality_recommendations(metrics: Dict[str, float]) -> List[str]:
    """Map sub-par quality metrics to improvement suggestions."""
    # (metric name, minimum acceptable value, suggestion text)
    rules = [
        ("completeness", 0.95, "数据完整性不足,建议检查数据生成逻辑"),
        ("diversity", 0.8, "数据多样性不足,建议增加更多变化"),
        ("consistency", 0.9, "数据一致性不足,建议检查模板配置"),
    ]
    return [advice for name, threshold, advice in rules if metrics[name] < threshold]
8. 数据预处理流程
8.1 数据清洗
def clean_dataset(dataset: List[Dict]) -> List[Dict]:
    """Drop samples with missing fields, invalid JSON, or bad prompt length.

    A sample is kept only when it:
      - carries both "instruction" and "output_json_minified",
      - has parseable JSON output,
      - has an instruction of 10–200 characters.

    Returns:
        The filtered sample list (original order preserved).
    """
    cleaned_dataset: List[Dict] = []
    for sample in dataset:
        # Required fields must both be present.
        if "instruction" not in sample or "output_json_minified" not in sample:
            continue
        try:
            json.loads(sample["output_json_minified"])
        # Narrowed from a bare `except:` — reject only unparseable output
        # (invalid JSON / non-string payload), not programming errors.
        except (TypeError, ValueError):
            continue
        # Instruction length must fall in [10, 200] characters.
        if not 10 <= len(sample["instruction"]) <= 200:
            continue
        cleaned_dataset.append(sample)
    return cleaned_dataset
8.2 数据标准化
def standardize_dataset(dataset: List[Dict]) -> List[Dict]:
    """Normalize prompt whitespace and re-minify each sample's JSON output.

    Samples whose JSON cannot be parsed are dropped.

    Returns:
        A new list of new sample dicts; the caller's samples are left
        untouched (the original version mutated them in place).
    """
    standardized_dataset: List[Dict] = []
    for sample in dataset:
        try:
            dsl = json.loads(sample["output_json_minified"])
        # Narrowed from a bare `except:` — drop only unparseable samples.
        except (KeyError, TypeError, ValueError):
            continue
        # BUG FIX: copy the sample instead of mutating the caller's dict.
        new_sample = dict(sample)
        new_sample["instruction"] = new_sample["instruction"].strip()
        # Re-serialize in canonical minified form.
        new_sample["output_json_minified"] = json.dumps(
            dsl, ensure_ascii=False, separators=(',', ':')
        )
        standardized_dataset.append(new_sample)
    return standardized_dataset
9. 数据质量最佳实践
9.1 数据生成最佳实践
- 使用模板驱动:确保数据结构的一致性
- 控制变化范围:避免过度变化导致的数据噪声
- 验证生成结果:每个生成的样本都要经过验证
- 保持平衡:确保各类数据的平衡分布
9.2 数据验证最佳实践
- 多层验证:结构验证、类型验证、业务逻辑验证
- 自动化检查:集成到数据生成流程中
- 详细日志:记录所有验证失败的原因
- 定期审查:定期检查数据质量指标
9.3 数据维护最佳实践
- 版本控制:对数据集进行版本管理
- 增量更新:支持数据集的增量更新
- 备份策略:定期备份高质量数据集
- 监控告警:设置质量指标告警阈值
10. 工具和脚本
10.1 数据质量检查脚本
#!/usr/bin/env python3
"""
数据质量检查脚本
使用方法: python check_data_quality.py --dataset data/synthetic/train.jsonl
"""
import json
import argparse
from pathlib import Path
def main():
    """CLI entry point: load a JSONL dataset and emit a quality report."""
    parser = argparse.ArgumentParser(description="检查数据质量")
    parser.add_argument("--dataset", required=True, help="数据集文件路径")
    parser.add_argument("--output", help="质量报告输出路径")
    args = parser.parse_args()
    # One JSON object per line (JSONL).
    with open(args.dataset, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line.strip()) for line in f]
    report = generate_quality_report(dataset)
    # Write to file when --output is given, otherwise print to stdout.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
    else:
        print(json.dumps(report, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
10.2 数据平衡脚本
#!/usr/bin/env python3
"""
数据平衡脚本
使用方法: python balance_dataset.py --input data/synthetic/train.jsonl --output data/synthetic/train_balanced.jsonl
"""
import json
import argparse
def main():
    """CLI entry point: balance a JSONL dataset by page type and save it."""
    parser = argparse.ArgumentParser(description="平衡数据集")
    parser.add_argument("--input", required=True, help="输入数据集路径")
    parser.add_argument("--output", required=True, help="输出数据集路径")
    parser.add_argument("--samples_per_type", type=int, default=200, help="每种页面类型的样本数")
    args = parser.parse_args()
    # One JSON object per line (JSONL).
    with open(args.input, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line.strip()) for line in f]
    balanced_dataset = balance_dataset_by_page_type(dataset, args.samples_per_type)
    # Persist the balanced dataset back out as JSONL.
    with open(args.output, 'w', encoding='utf-8') as f:
        for sample in balanced_dataset:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
    print(f"数据集已平衡,从 {len(dataset)} 个样本减少到 {len(balanced_dataset)} 个样本")


if __name__ == "__main__":
    main()
通过实施这些数据质量控制机制,可以确保AI UI生成系统使用高质量的训练数据,从而提高模型的性能和稳定性。数据质量是AI系统成功的基础,需要在整个数据生命周期中持续关注和改进。