turing-space / turing /CLI_runner /run_dataset.py
papri-ka's picture
Deploy FastAPI ML service to Hugging Face Spaces
5fc6e5d
import os
from pathlib import Path
import sys
from loguru import logger
import typer
from typing_extensions import Annotated
# Make the project root importable BEFORE importing from the `turing` package.
# Previously this path setup ran after the import attempt, so a failed import
# exited the process before the path fix could ever take effect.
# The root is taken to be two levels above this script's directory
# (presumably the directory that contains the `turing` package -- verify layout).
script_dir = os.path.dirname(os.path.abspath(__file__))
proj_root = os.path.dirname(os.path.dirname(script_dir))
if proj_root not in sys.path:
    # Appended (not inserted at position 0) so already-resolvable packages keep priority.
    sys.path.append(proj_root)

try:
    from turing.config import INTERIM_DATA_DIR, RAW_DATA_DIR  # noqa: E402
    from turing.dataset import DatasetManager  # noqa: E402
except ImportError as import_err:
    logger.error("Error: Could not import DatasetManager. Check sys.path configuration.")
    logger.error(f"Current sys.path: {sys.path}")
    # Surface the real cause: the failure may come from either import above.
    logger.error(f"Underlying import error: {import_err}")
    sys.exit(1)

# Typer application object; the commands below register themselves on it.
app = typer.Typer(help="CLI for dataset management (Download, Conversion, and Search).")
@app.command()
def download():
    """
    Loads the dataset from Hugging Face and saves it into the "raw" folder.
    """
    # The docstring above doubles as the CLI help text, so its wording is kept as-is.
    # Log the start, delegate the work to DatasetManager, then log completion.
    logger.info("Starting dataset download...")
    DatasetManager().download_dataset()
    logger.success("Download complete.")
@app.command(name="parquet-to-csv")
def parquet_to_csv():
    """
    Converts all parquet files in the raw data directory
    to CSV format in the interim data directory.
    """
    # The docstring above doubles as the CLI help text, so its wording is kept as-is.
    # The conversion itself is fully delegated to DatasetManager.parquet_to_csv().
    logger.info("Starting Parquet -> CSV conversion...")
    DatasetManager().parquet_to_csv()
    logger.success("Conversion complete.")
@app.command()
def search(
    filename: Annotated[
        str, typer.Argument(help="The exact filename to search for (e.g., 'java_train.parquet')")
    ],
    directory: Annotated[
        str,
        typer.Option(
            "--directory",
            "-d",
            help="Directory to search in. Keywords 'raw' or 'interim' can be used.",
        ),
    ] = "raw",
):
    """
    Searches for a file by name in the data directories.
    """
    # The docstring above doubles as the CLI help text, so its wording is kept as-is.
    logger.info(f"Initializing search for '{filename}'...")
    manager = DatasetManager()

    # Case-insensitive keywords mapped to (directory to search, log line to emit).
    keyword_targets = {
        "raw": (RAW_DATA_DIR, "Searching in 'raw' data directory."),
        "interim": (INTERIM_DATA_DIR, "Searching in 'interim' data directory."),
    }
    target = keyword_targets.get(directory.lower())
    if target is not None:
        search_path, notice = target
        logger.info(notice)
    else:
        # Anything that is not a known keyword is treated as a filesystem path.
        search_path = Path(directory)
        logger.info(f"Searching in custom path: {search_path}")

    hits = manager.search_file(filename, search_directory=search_path)
    if not hits:
        logger.warning(f"File '{filename}' not found in {search_path}.")
        return

    logger.success(f"Found {len(hits)} file(s):")
    for hit in hits:
        print(f"-> {hit}")
@app.command(name="show-raw-hf")
def show_raw_hf():
    """
    Loads and displays info about the raw dataset from Hugging Face.
    """
    # The docstring above doubles as the CLI help text, so its wording is kept as-is.
    logger.info("Loading raw dataset info from Hugging Face...")
    dataset = DatasetManager().get_raw_dataset_from_hf()

    # A falsy result (e.g. None) is reported as a retrieval failure.
    if not dataset:
        logger.error("Could not retrieve dataset.")
        return

    logger.info("Dataset info:")
    print(dataset)
# Script entry point: invoke the Typer app only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    app()