Files
2026-04-12 01:06:31 +07:00

216 lines
7.9 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CIP Design Core - BM25 search engine for Corporate Identity Program design guidelines
"""
import csv
import re
from pathlib import Path
from math import log
from collections import defaultdict
# ============ CONFIGURATION ============
DATA_DIR = Path(__file__).parent.parent.parent / "data" / "cip"
MAX_RESULTS = 3
CSV_CONFIG = {
"deliverable": {
"file": "deliverables.csv",
"search_cols": ["Deliverable", "Category", "Keywords", "Description", "Mockup Context"],
"output_cols": ["Deliverable", "Category", "Keywords", "Description", "Dimensions", "File Format", "Logo Placement", "Color Usage", "Typography Notes", "Mockup Context", "Best Practices", "Avoid"]
},
"style": {
"file": "styles.csv",
"search_cols": ["Style Name", "Category", "Keywords", "Description", "Mood"],
"output_cols": ["Style Name", "Category", "Keywords", "Description", "Primary Colors", "Secondary Colors", "Typography", "Materials", "Finishes", "Mood", "Best For", "Avoid For"]
},
"industry": {
"file": "industries.csv",
"search_cols": ["Industry", "Keywords", "CIP Style", "Mood"],
"output_cols": ["Industry", "Keywords", "CIP Style", "Primary Colors", "Secondary Colors", "Typography", "Key Deliverables", "Mood", "Best Practices", "Avoid"]
},
"mockup": {
"file": "mockup-contexts.csv",
"search_cols": ["Context Name", "Category", "Keywords", "Scene Description"],
"output_cols": ["Context Name", "Category", "Keywords", "Scene Description", "Lighting", "Environment", "Props", "Camera Angle", "Background", "Style Notes", "Best For", "Prompt Modifiers"]
}
}
# ============ BM25 IMPLEMENTATION ============
class BM25:
"""BM25 ranking algorithm for text search"""
def __init__(self, k1=1.5, b=0.75):
self.k1 = k1
self.b = b
self.corpus = []
self.doc_lengths = []
self.avgdl = 0
self.idf = {}
self.doc_freqs = defaultdict(int)
self.N = 0
def tokenize(self, text):
"""Lowercase, split, remove punctuation, filter short words"""
text = re.sub(r'[^\w\s]', ' ', str(text).lower())
return [w for w in text.split() if len(w) > 2]
def fit(self, documents):
"""Build BM25 index from documents"""
self.corpus = [self.tokenize(doc) for doc in documents]
self.N = len(self.corpus)
if self.N == 0:
return
self.doc_lengths = [len(doc) for doc in self.corpus]
self.avgdl = sum(self.doc_lengths) / self.N
for doc in self.corpus:
seen = set()
for word in doc:
if word not in seen:
self.doc_freqs[word] += 1
seen.add(word)
for word, freq in self.doc_freqs.items():
self.idf[word] = log((self.N - freq + 0.5) / (freq + 0.5) + 1)
def score(self, query):
"""Score all documents against query"""
query_tokens = self.tokenize(query)
scores = []
for idx, doc in enumerate(self.corpus):
score = 0
doc_len = self.doc_lengths[idx]
term_freqs = defaultdict(int)
for word in doc:
term_freqs[word] += 1
for token in query_tokens:
if token in self.idf:
tf = term_freqs[token]
idf = self.idf[token]
numerator = tf * (self.k1 + 1)
denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
score += idf * numerator / denominator
scores.append((idx, score))
return sorted(scores, key=lambda x: x[1], reverse=True)
# ============ SEARCH FUNCTIONS ============
def _load_csv(filepath):
"""Load CSV and return list of dicts"""
with open(filepath, 'r', encoding='utf-8') as f:
return list(csv.DictReader(f))
def _search_csv(filepath, search_cols, output_cols, query, max_results):
"""Core search function using BM25"""
if not filepath.exists():
return []
data = _load_csv(filepath)
# Build documents from search columns
documents = [" ".join(str(row.get(col, "")) for col in search_cols) for row in data]
# BM25 search
bm25 = BM25()
bm25.fit(documents)
ranked = bm25.score(query)
# Get top results with score > 0
results = []
for idx, score in ranked[:max_results]:
if score > 0:
row = data[idx]
results.append({col: row.get(col, "") for col in output_cols if col in row})
return results
def detect_domain(query):
"""Auto-detect the most relevant domain from query"""
query_lower = query.lower()
domain_keywords = {
"deliverable": ["card", "letterhead", "envelope", "folder", "shirt", "cap", "badge", "signage", "vehicle", "car", "van", "stationery", "uniform", "merchandise", "packaging", "banner", "booth"],
"style": ["style", "minimal", "modern", "luxury", "vintage", "industrial", "elegant", "bold", "corporate", "organic", "playful"],
"industry": ["tech", "finance", "legal", "healthcare", "hospitality", "food", "fashion", "retail", "construction", "logistics"],
"mockup": ["mockup", "scene", "context", "photo", "shot", "lighting", "background", "studio", "lifestyle"]
}
scores = {domain: sum(1 for kw in keywords if kw in query_lower) for domain, keywords in domain_keywords.items()}
best = max(scores, key=scores.get)
return best if scores[best] > 0 else "deliverable"
def search(query, domain=None, max_results=MAX_RESULTS):
"""Main search function with auto-domain detection"""
if domain is None:
domain = detect_domain(query)
config = CSV_CONFIG.get(domain, CSV_CONFIG["deliverable"])
filepath = DATA_DIR / config["file"]
if not filepath.exists():
return {"error": f"File not found: {filepath}", "domain": domain}
results = _search_csv(filepath, config["search_cols"], config["output_cols"], query, max_results)
return {
"domain": domain,
"query": query,
"file": config["file"],
"count": len(results),
"results": results
}
def search_all(query, max_results=2):
"""Search across all domains and combine results"""
all_results = {}
for domain in CSV_CONFIG.keys():
result = search(query, domain, max_results)
if result.get("results"):
all_results[domain] = result["results"]
return all_results
def get_cip_brief(brand_name, industry_query, style_query=None):
"""Generate a comprehensive CIP brief for a brand"""
# Search industry
industry_results = search(industry_query, "industry", 1)
industry = industry_results.get("results", [{}])[0] if industry_results.get("results") else {}
# Search style (use industry style if not specified)
style_query = style_query or industry.get("CIP Style", "corporate minimal")
style_results = search(style_query, "style", 1)
style = style_results.get("results", [{}])[0] if style_results.get("results") else {}
# Get recommended deliverables for the industry
key_deliverables = industry.get("Key Deliverables", "").split()
deliverable_results = []
for d in key_deliverables[:5]:
result = search(d, "deliverable", 1)
if result.get("results"):
deliverable_results.append(result["results"][0])
return {
"brand_name": brand_name,
"industry": industry,
"style": style,
"recommended_deliverables": deliverable_results,
"color_system": {
"primary": style.get("Primary Colors", industry.get("Primary Colors", "")),
"secondary": style.get("Secondary Colors", industry.get("Secondary Colors", ""))
},
"typography": style.get("Typography", industry.get("Typography", "")),
"materials": style.get("Materials", ""),
"finishes": style.get("Finishes", "")
}