|
|
import gradio as gr |
|
|
from huggingface_hub import InferenceClient |
|
|
import pandas as pd |
|
|
import json |
|
|
import os |
|
|
import time |
|
|
from datetime import datetime |
|
|
|
|
|
|
|
|
# System prompt sent as the "system" message of every chat completion request.
# It shows the model a few Category/Description examples and asks for a bare
# JSON object with "Category" and "Description" keys, which is what
# extract_json_from_response() expects to parse.
SYSTEM_INSTRUCTIONS = """You are an expert at writing clear and visual descriptions for a business category keyword for a yellow pages or business listing website. Given a category keyword, generate a single, detailed description that defines its key visual elements, location, and context. Do not add artistic or stylistic flair. Ensure that the description is CLIP model ready and not too verbose.

Here are some examples of the correct format:

Category: "Car Rental For Self Driven"

Description: "a car available for self-drive rental, parked at a pickup spot without a chauffeur; looks travel-ready, clean, well-maintained, keys handed over to customer"

Category: "Mehandi"

Description: "Temporary henna artwork applied on hands and feet using cones; fine brown or maroon floral and paisley patterns, mandalas, and lace-like detailing, commonly seen at weddings and festivals."

Category: "Photographer"

Description: "a person actively shooting photos or posing with a camera; holding a camera to eye, adjusting lens, or directing a subject during a shoot"

Category: "Equipment"

Description: "lighting stands, softboxes, strobes, tripods, reflectors, gimbals, battery packs, memory cards arranged as gear kits"

---

IMPORTANT: You must respond with ONLY a valid JSON object in this exact format:
{"Category": "category name", "Description": "description text"}

Do not include any other text, explanations, or markdown formatting. Only output the JSON object."""
|
|
|
|
|
|
|
|
def extract_json_from_response(response_text):
    """Extract the description string from a model reply that should hold JSON.

    The reply may be wrapped in a Markdown code fence and may carry extra
    prose around the JSON object. The first JSON object that decodes
    successfully is used, so stray braces before or after the object do not
    make an otherwise valid reply unparseable.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The stripped description taken from the "Description",
        "description" or "desc" key (first non-empty one, in that order).

    Raises:
        json.JSONDecodeError: If no JSON value can be decoded from the reply.
        ValueError: If the reply is not text, the decoded value is not a JSON
            object, or the description is missing, not a string, or shorter
            than 10 characters after stripping.
    """
    if not isinstance(response_text, str):
        raise ValueError("Response is not text")

    text = response_text.strip()

    # Keep only the contents of the first code fence, if the reply has one.
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0].strip()
    elif "```" in text:
        text = text.split("```", 2)[1].strip()

    if "{" in text and "}" in text:
        # Try every "{" as a possible start of the object; raw_decode stops at
        # the end of the first complete JSON value and ignores what follows.
        decoder = json.JSONDecoder()
        first_error = None
        start = text.find("{")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError as exc:
                if first_error is None:
                    first_error = exc
                start = text.find("{", start + 1)
            else:
                break
        else:
            # No candidate decoded: report the failure at the first brace.
            raise first_error
    else:
        parsed = json.loads(text)

    if not isinstance(parsed, dict):
        raise ValueError("Response is not a JSON object")

    description = ""
    for key in ("Description", "description", "desc"):
        value = parsed.get(key)
        if value:
            description = value
            break

    if not isinstance(description, str):
        raise ValueError("Description is not a string")

    description = description.strip()
    if len(description) < 10:
        raise ValueError("Description is missing or too short")

    return description
|
|
|
|
|
|
|
|
def process_single_category(category, client, max_tokens, temperature, top_p, retry_count=3):
    """Ask the model to describe one category, retrying on failure.

    Args:
        category: Category keyword to describe.
        client: Object whose ``chat_completion`` is called with ``stream=True``
            and iterated for response chunks.
        max_tokens: Forwarded to ``chat_completion``.
        temperature: Forwarded to ``chat_completion``.
        top_p: Forwarded to ``chat_completion``.
        retry_count: Maximum number of attempts.

    Returns:
        A ``(raw_response, description)`` tuple. ``raw_response`` is the
        stripped text collected from the stream. ``description`` is the value
        parsed out of it, or the raw text itself when the final attempt
        produced unparseable output that does not look like JSON.

    Raises:
        Exception: When every attempt failed; the message carries the last
            recorded error.
    """

    def _text_of(chunk):
        """Return the text carried by one streamed chunk ('' if it has none)."""
        if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
            first = chunk.choices[0]
            if hasattr(first, 'delta') and hasattr(first.delta, 'content'):
                return first.delta.content or ""
            return ""
        if isinstance(chunk, str):
            return chunk
        return ""

    conversation = [
        {"role": "system", "content": SYSTEM_INSTRUCTIONS},
        {"role": "user", "content": f'Category: "{category}"'},
    ]
    final_attempt = retry_count - 1
    failure = None

    for attempt in range(retry_count):
        try:
            # Pause briefly before every retry (not before the first attempt).
            if attempt:
                time.sleep(1)

            raw = ""
            stream = client.chat_completion(
                conversation,
                max_tokens=max_tokens,
                stream=True,
                temperature=temperature,
                top_p=top_p,
            )
            for chunk in stream:
                raw += _text_of(chunk)

            if len(raw.strip()) < 5:
                raise ValueError("Empty or too short response from model")

            parsed_description = extract_json_from_response(raw)
            return raw.strip(), parsed_description

        except json.JSONDecodeError as exc:
            failure = f"JSON parsing failed (attempt {attempt + 1}/{retry_count}): {exc}"
            # Last resort on the final attempt: accept plain prose as the
            # description when it is long enough and does not start like JSON.
            if (
                attempt == final_attempt
                and raw
                and len(raw.strip()) > 20
                and not raw.startswith("{")
            ):
                return raw.strip(), raw.strip()
        except Exception as exc:
            failure = f"Processing failed (attempt {attempt + 1}/{retry_count}): {exc}"

    raise Exception(f"Failed after {retry_count} attempts. Last error: {failure}")
|
|
|
|
|
|
|
|
def process_csv_files(
    files,
    category_column,
    max_tokens,
    temperature,
    top_p,
    progress=gr.Progress()
):
    """Generate descriptions for the category keywords of each uploaded CSV.

    For every file, the unique non-null values of ``category_column`` are sent
    to the model one by one and the results are written to a new CSV in the
    current working directory named ``output_<input name>_<timestamp>.csv``
    with the columns Category, Description, Raw_Response and Status.

    Args:
        files: Uploaded files; each item is either an object exposing the
            path as ``.name`` or the path itself.
        category_column: Name of the CSV column holding the category keywords.
        max_tokens: Forwarded to ``process_single_category``.
        temperature: Forwarded to ``process_single_category``.
        top_p: Forwarded to ``process_single_category``.
        progress: Gradio progress tracker, called once per category.

    Returns:
        A ``(status_text, output_files)`` tuple. ``output_files`` is the list
        of written CSV paths, or ``None`` when nothing was produced.
    """
    if not files:
        return "Please upload at least one CSV file.", None

    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    if not hf_token:
        return "❌ Error: HF_TOKEN not found. Please add your Hugging Face token as a Space Secret.\n\nGo to Space Settings → Secrets → Add 'HF_TOKEN'", None

    client = InferenceClient(token=hf_token, model="openai/gpt-oss-20b")

    output_columns = ["Category", "Description", "Raw_Response", "Status"]
    output_files = []
    status_messages = []

    for file_idx, file in enumerate(files):
        # Accept both upload objects (path in .name) and plain path strings.
        file_path = getattr(file, "name", file)
        file_label = os.path.basename(str(file_path))

        try:
            df = pd.read_csv(file_path)
            status_messages.append(f"📄 Processing file {file_idx + 1}/{len(files)}: {file_label}")

            if category_column not in df.columns:
                # Column labels are not guaranteed to be strings.
                available = ", ".join(map(str, df.columns))
                status_messages.append(f"⚠️ Warning: Column '{category_column}' not found in {file_label}. Available columns: {available}")
                continue

            descriptions = []
            categories = df[category_column].dropna().unique()
            total_categories = len(categories)

            for idx, category in enumerate(categories):
                progress(
                    (file_idx * total_categories + idx) / (len(files) * total_categories),
                    desc=f"Processing category {idx + 1}/{total_categories} in file {file_idx + 1}",
                )

                # Best effort per category: a failure is recorded in the
                # output row instead of aborting the whole file.
                try:
                    raw_response, description = process_single_category(
                        category, client, max_tokens, temperature, top_p, retry_count=3
                    )

                    if not description or len(description.strip()) < 10:
                        raise ValueError("Description is too short or empty")

                    descriptions.append({
                        "Category": category,
                        "Description": description,
                        "Raw_Response": raw_response,
                        "Status": "Success",
                    })
                    status_messages.append(f"✅ Processed: {category}")

                except Exception as e:
                    error_msg = str(e)
                    status_messages.append(f"⚠️ Error processing '{category}': {error_msg}")
                    descriptions.append({
                        "Category": category,
                        "Description": f"[FAILED - {error_msg[:100]}]",
                        "Raw_Response": "",
                        "Status": "Failed",
                    })

                # Short pause between requests.
                time.sleep(0.5)

            # Explicit columns keep the header row even when there were no
            # categories to process.
            output_df = pd.DataFrame(descriptions, columns=output_columns)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            base_name = os.path.splitext(file_label)[0]
            output_filename = f"output_{base_name}_{timestamp}.csv"
            if output_filename in output_files:
                # Same input name within the same second: do not overwrite the
                # earlier result.
                output_filename = f"output_{base_name}_{timestamp}_{file_idx + 1}.csv"
            output_df.to_csv(output_filename, index=False)
            output_files.append(output_filename)

            success_count = sum(1 for d in descriptions if d.get("Status") == "Success")
            failed_count = sum(1 for d in descriptions if d.get("Status") == "Failed")
            status_messages.append(f"✅ Completed: {success_count} succeeded, {failed_count} failed out of {len(descriptions)} categories from {file_label}")

        except Exception as e:
            # Best effort per file: report the problem and continue with the
            # remaining uploads.
            status_messages.append(f"❌ Error processing {file_label}: {str(e)}")

    status_text = "\n".join(status_messages)

    if output_files:
        return status_text, output_files
    return status_text + "\n\n❌ No output files generated.", None
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: model settings on the left, file upload and column name on the
# right, followed by the status box and the downloadable result files.
with gr.Blocks(title="Business Category Description Generator") as demo:
    gr.Markdown("""
    # 🏢 Business Category Description Generator

    Upload CSV files containing business category keywords, and this app will generate
    CLIP-ready visual descriptions for each category using AI.

    **Instructions:**
    1. Upload one or more CSV files
    2. Specify the column name that contains the category keywords
    3. Adjust model settings (lower temperature = more consistent output)
    4. Click "Process Files" to generate descriptions
    5. Download the output CSV files with Status column

    **Features:**
    - ✅ Automatic retry logic (3 attempts per category)
    - ✅ JSON validation and error recovery
    - ✅ Progress tracking with detailed status
    - ✅ Success/failure reporting

    *Note: For faster processing, use Zero GPU (see Space Settings). Authentication via HF_TOKEN secret.*
    """)

    with gr.Row():
        # Sampling parameters forwarded to the model for every category.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Model Settings")
            max_tokens = gr.Slider(
                minimum=64,
                maximum=512,
                value=256,
                step=16,
                label="Max Tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.3,
                step=0.1,
                label="Temperature",
                info="Lower = more consistent output"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p"
            )

        # Input files, the column to read, and the button that starts the run.
        with gr.Column(scale=2):
            files_input = gr.File(
                label="📤 Upload CSV Files",
                file_count="multiple",
                file_types=[".csv"]
            )
            category_column = gr.Textbox(
                label="📋 Category Column Name",
                value="category",
                placeholder="Enter the name of the column containing categories"
            )
            process_btn = gr.Button("🚀 Process Files", variant="primary", size="lg")

    status_output = gr.Textbox(
        label="📊 Status",
        lines=10,
        interactive=False
    )
    files_output = gr.File(
        label="💾 Download Output Files",
        file_count="multiple"
    )

    # The input order must match the parameter order of process_csv_files.
    process_btn.click(
        fn=process_csv_files,
        inputs=[
            files_input,
            category_column,
            max_tokens,
            temperature,
            top_p
        ],
        outputs=[status_output, files_output]
    )

    gr.Markdown("""
    ---
    ### 📋 Output Format
    Each output CSV file will contain:
    - **Category**: The original category keyword
    - **Description**: The generated visual description (validated and cleaned)
    - **Raw_Response**: The complete model response (for debugging)
    - **Status**: Success or Failed (with error details)

    💡 **Tips for Best Results:**
    - Use Temperature 0.2-0.4 for consistent, focused descriptions
    - Use Temperature 0.6-0.8 for more creative variations
    - Failed categories are marked clearly - you can reprocess them separately
    - Zero GPU acceleration: Add @spaces.GPU decorator or enable in Space Settings
    """)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|