[ ]:
from IPython.display import Markdown, display
import inspect
import pathlib
import ast

def display_task_source(task_func, title):
    """Render a task function's source code as a Markdown code block.

    The function is located in its defining module's source file via the AST.
    The displayed snippet runs from the ``@flowtask`` decorator (or, if the
    function has no such decorator, from its first decorator / its ``def``
    line) to the end of the function, with the function's docstring removed.

    Args:
        task_func: Function object whose source should be shown. It must be
            defined in a module backed by a source file.
        title: Heading text shown above the code block. Falls back to the
            function's name when empty.

    Returns:
        None. The result (or an error notice) is shown with ``display``.
    """
    module = inspect.getmodule(task_func)
    func_name = getattr(task_func, '__name__', repr(task_func))

    # Modules without a backing file (e.g. functions defined in the notebook
    # itself) expose no __file__, or expose it as None.
    source_path = getattr(module, '__file__', None) if module else None
    if not source_path:
        display(Markdown(f"❌ Could not find source file for `{func_name}`"))
        return

    try:
        file_content = pathlib.Path(source_path).read_text(encoding='utf-8')
        tree = ast.parse(file_content)
    except (OSError, SyntaxError, ValueError):
        # ValueError also covers UnicodeDecodeError for non UTF-8 files.
        display(Markdown(f"❌ Could not parse function `{func_name}` from source file"))
        return

    node = _find_function_node(tree, func_name)
    if node is None:
        display(Markdown(f"❌ Could not parse function `{func_name}` from source file"))
        return

    source_code = '\n'.join(_task_source_lines(file_content.splitlines(), node))
    heading = title or func_name
    markdown_content = (
        f"\n\n### `{heading}`\n\n"
        f"```python\n{source_code}\n```\n"
    )
    display(Markdown(markdown_content))


def _find_function_node(tree, func_name):
    """Return the first (async) function definition named ``func_name`` in ``tree``, or None."""
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == func_name:
            return node
    return None


def _task_source_lines(lines, node):
    """Return the lines of ``node`` from its @flowtask decorator to its end, minus the docstring."""
    def_idx = node.lineno - 1   # 0-indexed line of the `def` keyword
    end_idx = node.end_lineno   # 1-indexed inclusive end == exclusive slice bound

    # Only look inside this function's own decorator region, so a decorator of
    # an earlier function can never be picked up and the index cannot underflow.
    start_idx = def_idx
    if node.decorator_list:
        top_idx = min(dec.lineno for dec in node.decorator_list) - 1
        start_idx = top_idx
        for idx in range(def_idx - 1, top_idx - 1, -1):
            if lines[idx].strip().startswith('@flowtask'):
                start_idx = idx
                break

    kept = lines[start_idx:end_idx]

    # The docstring is a leading string-constant expression statement. It is
    # only dropped when it starts below the `def` line, so that a one-line
    # function never loses its signature.
    first_stmt = node.body[0] if node.body else None
    if (
        isinstance(first_stmt, ast.Expr)
        and isinstance(first_stmt.value, ast.Constant)
        and isinstance(first_stmt.value.value, str)
        and first_stmt.lineno > node.lineno
    ):
        doc_from = first_stmt.lineno - 1 - start_idx
        doc_to = first_stmt.end_lineno - start_idx
        kept = kept[:doc_from] + kept[doc_to:]

    return kept

Fabwave - Part Classification using HOOPS AI

[2]:
import hoops_ai
import os

# Register the test license returned by use_test_license(); validate=False is passed
# straight to set_license (presumably skips license validation - TODO confirm).
hoops_ai.set_license(hoops_ai.use_test_license(), validate=False)


ℹ️ Using TEST LICENSE (expires December 8, 2025 - 37 days remaining)
   For production use, obtain your own license from Tech Soft 3D
[3]:
# The flow name is provided by cad_tasks_fabwave, the same external module
# that hosts the task functions used later in this notebook.
from cad_tasks_fabwave import get_flow_name
flow_name = get_flow_name()
print(f"Flow name: {flow_name}")
Flow name: ETL_Fabwave_training
[4]:
import os
import pathlib
from typing import Tuple, List

# Import the flow builder framework from the library
import hoops_ai
from hoops_ai.flowmanager import flowtask


from hoops_ai.cadaccess import HOOPSLoader, HOOPSTools
from hoops_ai.cadencoder import BrepEncoder
from hoops_ai.dataset import DatasetExplorer
from hoops_ai.storage import DataStorage, CADFileRetriever, LocalStorageProvider
from hoops_ai.storage.datasetstorage.schema_builder import SchemaBuilder

Configuring Setup

[5]:
# Configuration: every flow output is written below "<working directory>/out".
# NOTE: nb_dir is the kernel's current working directory, so the notebook is
# expected to be started from the folder it lives in.
nb_dir = pathlib.Path.cwd()
flows_outputdir = nb_dir.joinpath("out")
# Import task functions from external module for ProcessPoolExecutor compatibility
from cad_tasks_fabwave import gather_fabwave_files, encode_data_for_ml_training, my_workflow_for_fabewave, get_flow_name

ETL Data pipeline

[6]:
# Data source: the whole "fabwave" folder, located under packages/cadfiles one
# level above the notebook directory.
# To process only a few classes, list individual class folders instead, e.g.:
#   str(nb_dir.parent.joinpath("packages", "cadfiles", "fabwave", "CAD_1_15_Classes", "Bolts")),
#   str(nb_dir.parent.joinpath("packages", "cadfiles", "fabwave", "CAD_1_15_Classes", "Bushing_Damping_Liners")),
#   str(nb_dir.parent.joinpath("packages", "cadfiles", "fabwave", "CAD25-45_TOTAL1000", "Sleeve Washers")),
datasources_dir = [
    str(nb_dir.parent.joinpath("packages", "cadfiles", "fabwave")),
]
[7]:
# Show the source of the file-gathering task as it is written in cad_tasks_fabwave.
display_task_source(gather_fabwave_files, "gather_fabwave_files")

gather_fabwave_files

@flowtask.extract(
    name="gather fabwave files",
    inputs=["cad_datasources"],
    outputs=["cad_dataset"],
    parallel_execution=True
)
def gather_fabwave_files(source: str) -> List[str]:

    # Example 1: Basic retrieval with format filtering
    retriever = CADFileRetriever(
        storage_provider=LocalStorageProvider(directory_path=source),
        formats=[".stp", ".step", ".iges", ".igs"],
        #filter_pattern="*5*"  # Only files with "5" in name
    )

    # Get files using the library's retriever
    source_files = retriever.get_file_list()

    # Shuffle to get random sample instead of first N files in order
    import random
    random.seed(42)  # For reproducibility
    shuffled_files = source_files.copy()
    random.shuffle(shuffled_files)

    return shuffled_files
[8]:
# Numeric class code -> {"name": class name, "description": free text}.
# encode_data_for_ml_training looks the class name up from a folder name in the
# CAD file path, so names must match those folder names exactly.
labels_description = {
    0: {"name": "Bearings", "description": " fabewave dataset sample  "},
    1: {"name": "Bolts", "description": " fabewave dataset sample  "},
    2: {"name": "Brackets", "description": " fabewave dataset sample  "},
    3: {"name": "Bushing", "description": " fabewave dataset sample  "},
    4: {"name": "Bushing_Damping_Liners", "description": " fabewave dataset sample  "},
    5: {"name": "Collets", "description": " fabewave dataset sample  "},
    6: {"name": "Gasket", "description": " fabewave dataset sample  "},
    7: {"name": "Grommets", "description": " fabewave dataset sample  "},
    8: {"name": "HeadlessScrews", "description": " fabewave dataset sample  "},
    9: {"name": "Hex_Head_Screws", "description": " fabewave dataset sample  "},
    10: {"name": "Keyway_Shaft", "description": " fabewave dataset sample  "},
    11: {"name": "Machine_Key", "description": " fabewave dataset sample  "},
    12: {"name": "Nuts", "description": " fabewave dataset sample  "},
    13: {"name": "O_Rings", "description": " fabewave dataset sample  "},
    14: {"name": "Thumb_Screws", "description": " fabewave dataset sample   "},
    15: {"name": "Pipe_Fittings", "description": " fabewave dataset sample   "},
    16: {"name": "Pipe_Joints", "description": " fabewave dataset sample  "},
    17: {"name": "Pipes", "description": " fabewave dataset sample  "},
    18: {"name": "Rollers", "description": " fabewave dataset sample  "},
    19: {"name": "Rotary_Shaft", "description": " fabewave dataset sample  "},
    20: {"name": "Shaft_Collar", "description": " fabewave dataset sample  "},
    21: {"name": "Slotted_Flat_Head_Screws", "description": " fabewave dataset sample  "},
    22: {"name": "Socket_Head_Screws", "description": " fabewave dataset sample  "},
    23: {"name": "Washers", "description": " fabewave dataset sample  "},
    24: {"name": "Boxes", "description": " fabewave dataset sample  "},
    25: {"name": "Cotter_Pin", "description": " fabewave dataset sample  "},
    26: {"name": "External Retaining Rings", "description": " fabewave dataset sample  "},
    27: {"name": "Eyesbolts With Shoulders", "description": " fabewave dataset sample  "},
    28: {"name": "Fixed Cap Flange", "description": " fabewave dataset sample  "},
    29: {"name": "Gear Rod Stock", "description": " fabewave dataset sample  "},
    30: {"name": "Gears", "description": " fabewave dataset sample  "},
    31: {"name": "Holebolts With Shoulders", "description": " fabewave dataset sample   "},
    32: {"name": "Idler Sprocket", "description": " fabewave dataset sample   "},
    33: {"name": "Miter Gear Set Screw", "description": " fabewave dataset sample   "},
    34: {"name": "Miter Gears", "description": " fabewave dataset sample   "},
    35: {"name": "Rectangular Gear Rack", "description": " fabewave dataset sample   "},
    36: {"name": "Routing EyeBolts Bent Closed Eye", "description": " fabewave dataset sample   "},
    37: {"name": "Sleeve Washers", "description": " fabewave dataset sample   "},
    38: {"name": "Socket-Connect Flanges", "description": " fabewave dataset sample   "},
    39: {"name": "Sprocket Taper-Lock Bushing", "description": " fabewave dataset sample   "},
    40: {"name": "Strut Channel Floor Mount", "description": " fabewave dataset sample   "},
    41: {"name": "Strut Channel Side-Side", "description": " fabewave dataset sample   "},
    42: {"name": "Tag Holder", "description": " fabewave dataset sample   "},
    43: {"name": "Webbing Guide", "description": " fabewave dataset sample   "},
    44: {"name": "Wide Grip External Retaining Ring", "description": " fabewave dataset sample   "},
}

# Reverse lookup: class name -> numeric class code
description_to_code = dict(
    (entry["name"], code) for code, entry in labels_description.items()
)

Data Transformation: Encoding Data for Use as ML Input

[9]:
# Show the source of the encoding task as it is written in cad_tasks_fabwave.
display_task_source(encode_data_for_ml_training, "encode_data_for_ml_training")

encode_data_for_ml_training

@flowtask.transform(
    name="Preparing data for Exploring and ML training",
    inputs=["cad_dataset"],
    outputs=["cad_files_encoded"],
    parallel_execution=True
)
def encode_data_for_ml_training(cad_file: str, cad_loader :  HOOPSLoader, storage : DataStorage) -> str:
    import numpy as np
    import random

    cad_model = cad_loader.create_from_file(cad_file)
    storage.set_schema(cad_schema)

    facecount, edgecount = my_workflow_for_fabewave.encode_cad_data(cad_file, cad_loader, storage)

    # Add label data
    folder_with_name = str(pathlib.Path(cad_file).parent.parent.stem)
    label_code = description_to_code.get(folder_with_name, None)

    # Validate label_code - skip if unknown category
    if label_code is None:
        raise ValueError(f"Unknown category '{folder_with_name}' for file {cad_file}. Category not found in labels_description.")

    label_description = [{int(label_code) : labels_description[label_code]["name"]} ]

    # Save label data in the schema-defined group for dataset analytics
    storage.save_data("Labels/part_label", np.array([label_code]))
    storage.save_metadata("part_label_description", folder_with_name)

    # ALSO save label using the key expected by GraphClassification.convert_encoded_data_to_graph
    # This is required for the DGL graph files to have the correct labels
    storage.save_data(LabelStorage.GRAPH_CADENTITY, np.array([label_code]))

    #my_workflow_for_fabewave.encode_label_data()
    dgl_storage = DGLGraphStoreHandler()

    # DGL graph Bin file
    item_no_suffix = pathlib.Path(cad_file).with_suffix("")  # Remove the suffix to get the base name
    hash_id = generate_unique_id_from_path(str(item_no_suffix))
    dgl_output_path = pathlib.Path(flows_outputdir).joinpath("flows", flow_name, "dgl", f"{hash_id}.ml")
    dgl_output_path.parent.mkdir(parents=True, exist_ok=True)

    my_workflow_for_fabewave.convert_encoded_data_to_graph(storage, dgl_storage, str(dgl_output_path))

    # Save file-level metadata (will be routed to .infoset)
    storage.save_metadata("Item", str(cad_file))
    storage.save_metadata("source", "FABWAVE")

    # Compress the storage into a .data file
    storage.compress_store()

    # Return the base storage path
    return storage.get_file_path("")

Pipeline execution

[10]:
# Create and run the Data Flow: gather the CAD files, then encode each of them.
flow_name = get_flow_name()
cad_flow = hoops_ai.create_flow(
    name=flow_name,
    tasks=[gather_fabwave_files, encode_data_for_ml_training],
    max_workers=50,
    flows_outputdir=str(flows_outputdir),
    ml_task="Part Classification",
    auto_dataset_export=True,  # Enable automatic dataset merging
    export_visualization=True  # Enable visualization export
)

# Run the flow to process all files
print("Starting flow execution with parallel processing...")
flow_output, output_dict, flow_file = cad_flow.process(inputs={'cad_datasources': datasources_dir})

# Display results.
# NOTE: the banner is printed whenever process() returns; items that failed are
# reported separately by the flow log (see error_summary.json in the flow directory).
print("\n" + "="*70)
print("FLOW EXECUTION COMPLETED SUCCESSFULLY")
print("="*70)
print("\nDataset files created:")
print(f"  Main dataset: {output_dict.get('flow_data', 'N/A')}")
print(f"  Info dataset: {output_dict.get('flow_info', 'N/A')}")
print(f"  Attributes: {output_dict.get('flow_attributes', 'N/A')}")
print(f"  Flow file: {flow_file}")
print(f"\nTotal processing time: {output_dict.get('Duration [seconds]', {}).get('total', 0):.2f} seconds")
print(f"Files processed: {output_dict.get('file_count', 0)}")

Starting flow execution with parallel processing...
|INFO| FLOW | ######### Flow 'ETL_Fabwave_training' start #######
|WARNING| FLOW | Cleaning up existing flow directory: C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training
|WARNING| FLOW | Removing all previous outputs for flow 'ETL_Fabwave_training' to avoid build conflicts.
|INFO| FLOW | Flow directory successfully cleaned and recreated: C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training
|INFO| FLOW |
Flow Execution Summary
|INFO| FLOW | ==================================================
|INFO| FLOW | Task 1: gather fabwave files
|INFO| FLOW |     Inputs : cad_datasources
|INFO| FLOW |     Outputs: cad_dataset
|INFO| FLOW | Task 2: Preparing data for Exploring and ML training
|INFO| FLOW |     Inputs : cad_dataset
|INFO| FLOW |     Outputs: cad_files_encoded
|INFO| FLOW | Task 3: AutoDatasetExportTask
|INFO| FLOW |     Inputs : cad_files_encoded
|INFO| FLOW |     Outputs: encoded_dataset, encoded_dataset_info, encoded_dataset_attribs
|INFO| FLOW |
Task Dependencies:
|INFO| FLOW | gather fabwave files has no dependencies.
|INFO| FLOW | gather fabwave files --> Preparing data for Exploring and ML training
|INFO| FLOW | Preparing data for Exploring and ML training --> AutoDatasetExportTask
|INFO| FLOW | ==================================================

|INFO| FLOW | Executing ParallelTask 'gather fabwave files' with 1 items.
|INFO| FLOW | Executing ParallelTask 'Preparing data for Exploring and ML training' with 4572 items.
|WARNING| FLOW | Total number of items with errors: 26 (0.57%)
|WARNING| FLOW | Corrupted items are listed in 'C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training\error_summary.json'.
|INFO| FLOW | Executing SequentialTask 'AutoDatasetExportTask'.
[DatasetMerger] Saved schema with 4 groups to metadata.json
|INFO| FLOW | Auto dataset export completed in 10976.49 seconds
Sequential Task end=====================
|INFO| FLOW | Time taken: 17058.31 seconds
|INFO| FLOW | ######### Flow 'ETL_Fabwave_training' end ######

======================================================================
FLOW EXECUTION COMPLETED SUCCESSFULLY
======================================================================

Dataset files created:
  Main dataset: C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training\ETL_Fabwave_training.dataset
  Info dataset: C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training\ETL_Fabwave_training.infoset
  Attributes: C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training\ETL_Fabwave_training.attribset
  Flow file: C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out/flows/ETL_Fabwave_training/ETL_Fabwave_training.flow

Total processing time: 17058.31 seconds
Files processed: 4572

Data Serving : Analytics

[11]:
# Explore the generated dataset.
# flow_file is the path returned by cad_flow.process(); to explore a flow from an earlier
# run instead, assign the path of its ".flow" file to flow_file before running this cell.

explorer = DatasetExplorer(flow_output_file=str(flow_file))
explorer.print_table_of_contents()
[DatasetExplorer] Default local cluster started: <Client: 'tcp://127.0.0.1:58622' processes=1 threads=16, memory=7.45 GiB>

 Dataset Table of Contents

LABELS_GROUP:
  FILE_ID_CODE_LABELS_DATA: Shape: (4546,), Dims: ('Labels_part_label_dim_0',), Size: 4546
  PART_LABEL_DATA: Shape: (4546,), Dims: ('Labels_part_label_dim_0',), Size: 4546

EDGES_GROUP:
  EDGE_CONVEXITIES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_DIHEDRAL_ANGLES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_INDICES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_LENGTHS_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_TYPES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_U_GRIDS_DATA: Shape: (337065, 10, 6), Dims: ('edge', 'u', 'component'), Size: 20223900
  FILE_ID_CODE_EDGES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065

FACES_GROUP:
  FACE_AREAS_DATA: Shape: (130923,), Dims: ('face',), Size: 130923
  FACE_DISCRETIZATION_DATA: Shape: (130923, 100, 7), Dims: ('face', 'sample', 'component'), Size: 91646100
  FACE_INDICES_DATA: Shape: (130923,), Dims: ('face',), Size: 130923
  FACE_LOOPS_DATA: Shape: (130923,), Dims: ('face',), Size: 130923
  FACE_TYPES_DATA: Shape: (130923,), Dims: ('face',), Size: 130923
  FILE_ID_CODE_FACES_DATA: Shape: (130923,), Dims: ('face',), Size: 130923

GRAPH_GROUP:
  EDGES_DESTINATION_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGES_SOURCE_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  FILE_ID_CODE_GRAPH_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  NUM_NODES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
==================================
Columns in file_info:
                                  name    id                             description                        stream_cache_png                         stream_cache_3d subset table_name
0     000757c971d4af379cff2bf219566f76     0  ...25a-d5c6-4a99-a7f1-efc5c122e442.stp  ...00757c971d4af379cff2bf219566f76.png  ...00757c971d4af379cff2bf219566f76.scs    N/A  file_info
1     000d2096f1b75775aee0cca3869062ac     1  ...376-4e32-4a62-8636-833a596c3a24.stp  ...00d2096f1b75775aee0cca3869062ac.png  ...00d2096f1b75775aee0cca3869062ac.scs    N/A  file_info
2     0018d103885051d4463bfbdf97642644     2  ...8a0-14fc-4bcc-94f5-7e206c7ab2eb.stp  ...018d103885051d4463bfbdf97642644.png  ...018d103885051d4463bfbdf97642644.scs    N/A  file_info
3     001ec9bc5e47900ff95873b20a8cf97e     3  ...s\STEP\sshettigarsleevewasher30.stp  ...01ec9bc5e47900ff95873b20a8cf97e.png  ...01ec9bc5e47900ff95873b20a8cf97e.scs    N/A  file_info
4     002b9953b2669f0dbf7e95a97d47e759     4  ...b39-9f03-490e-b962-55ec3761f456.stp  ...02b9953b2669f0dbf7e95a97d47e759.png  ...02b9953b2669f0dbf7e95a97d47e759.scs    N/A  file_info
5     0043ebf2efcc3849c5b61c45f4190369     5  ...aae-e8c6-40df-a29b-5770a2d5a73c.stp  ...043ebf2efcc3849c5b61c45f4190369.png  ...043ebf2efcc3849c5b61c45f4190369.scs    N/A  file_info
6     0045b381a0be8fbc8d49688a6479a076     6  ...ef6-87ef-45b6-8c9c-0414903668e3.stp  ...045b381a0be8fbc8d49688a6479a076.png  ...045b381a0be8fbc8d49688a6479a076.scs    N/A  file_info
7     005479bbb0a5df52f6d9cc2ad0f8253c     7  ...s\STEP\sshettigarsleevewasher44.stp  ...05479bbb0a5df52f6d9cc2ad0f8253c.png  ...05479bbb0a5df52f6d9cc2ad0f8253c.scs    N/A  file_info
8     0054a7cfbadf02dce98531f4dd7508a5     8  ...800-c9bd-4dd6-b425-30bf848fcd41.stp  ...054a7cfbadf02dce98531f4dd7508a5.png  ...054a7cfbadf02dce98531f4dd7508a5.scs    N/A  file_info
9     00a5e9cd26129c2757fe6a08407df0f9     9  ...dbe-97b8-46d4-b550-aa40dab589cd.stp  ...0a5e9cd26129c2757fe6a08407df0f9.png  ...0a5e9cd26129c2757fe6a08407df0f9.scs    N/A  file_info
...                                ...   ...                                     ...                                     ...                                     ...    ...        ...
4536  ff3d884bd5a5e96e491a4ea638d583e0  4536  ...5ae-1468-46f5-bce0-c959e3cbf870.stp  ...f3d884bd5a5e96e491a4ea638d583e0.png  ...f3d884bd5a5e96e491a4ea638d583e0.scs    N/A  file_info
4537  ff3fcef336d538c95a1f04ccf01339f1  4537  ...02-e51d-43d8-bd4b-171c3589485b.step  ...f3fcef336d538c95a1f04ccf01339f1.png  ...f3fcef336d538c95a1f04ccf01339f1.scs    N/A  file_info
4538  ff69cdbe147e5b2b231a48c668268d3b  4538  ...c3a-a37f-416c-8ddb-e0683a1a8926.stp  ...f69cdbe147e5b2b231a48c668268d3b.png  ...f69cdbe147e5b2b231a48c668268d3b.scs    N/A  file_info
4539  ff860352bbd8c7af1b596835829288c1  4539  ...543-8b94-4dee-a8e5-fdc362a80359.stp  ...f860352bbd8c7af1b596835829288c1.png  ...f860352bbd8c7af1b596835829288c1.scs    N/A  file_info
4540  ff94b5f50c009fcc19e231c8dc4fef18  4540  ...lasses\Brackets\STEP\bracket217.stp  ...f94b5f50c009fcc19e231c8dc4fef18.png  ...f94b5f50c009fcc19e231c8dc4fef18.scs    N/A  file_info
4541  ff96da3be8fa50b0c3516392e7c57770  4541  ...cdb-e812-42b6-a29f-5973b8b41042.stp  ...f96da3be8fa50b0c3516392e7c57770.png  ...f96da3be8fa50b0c3516392e7c57770.scs    N/A  file_info
4542  ffb399838ecd3d81f1e05edd13002887  4542  ...106-6aba-43ff-b412-4b97010388b2.stp  ...fb399838ecd3d81f1e05edd13002887.png  ...fb399838ecd3d81f1e05edd13002887.scs    N/A  file_info
4543  ffc34e21ca8c3b0aefc40057489b1db0  4543  ...30-ad62-4695-9e60-317db1586e81.step  ...fc34e21ca8c3b0aefc40057489b1db0.png  ...fc34e21ca8c3b0aefc40057489b1db0.scs    N/A  file_info
4544  ffd6cacefde868ff77c8057d55add378  4544  ...83d-2422-433d-88d1-0193c2c1471c.stp  ...fd6cacefde868ff77c8057d55add378.png  ...fd6cacefde868ff77c8057d55add378.scs    N/A  file_info
4545  ffd8d4beab2e24522adc38308def99ec  4545  ...200-0d37-4719-b72d-b58833bef79a.stp  ...fd8d4beab2e24522adc38308def99ec.png  ...fd8d4beab2e24522adc38308def99ec.scs    N/A  file_info

ML-Ready Dataset Preparation

The DatasetLoader provides tools for preparing the merged dataset for machine learning:

Key Capabilities:

  • Stratified Splitting: Create train/validation/test splits while preserving class distributions

  • Subset Tracking: Records file assignments in the dataset metadata

[12]:
# List the data groups stored in the merged dataset.
print(explorer.available_groups())
{'graph', 'faces', 'edges', 'Labels'}
[13]:
# List the arrays available inside the 'Labels' group.
print(explorer.available_arrays('Labels'))
{'part_label', 'file_id_code_Labels'}
[14]:
# Visualization libraries
import matplotlib.pyplot as plt

def print_distribution_info(dist, title="Distribution"):
    """Plot how many files fall into each bin of a distribution as a bar chart.

    Args:
        dist: Mapping with ``'file_id_codes_in_bins'`` (one array-like per bin;
            its ``.size`` is taken as the number of files in that bin) and
            ``'bin_edges'`` (array of bin boundaries). The per-bin counts are
            written back into it under ``'file_count'``.
        title: Prefix of the figure title (``"<title> Histogram"``).
    """
    # Per-bin file counts; stored on the dict itself, as callers may read it afterwards.
    dist['file_count'] = [codes.size for codes in dist['file_id_codes_in_bins']]

    # Visualization with matplotlib
    fig, ax = plt.subplots(figsize=(12, 4))

    edges = dist['bin_edges']
    centers = 0.5 * (edges[1:] + edges[:-1])
    # Every bar uses the width of the first bin.
    first_bin_width = edges[1] - edges[0]
    ax.bar(
        centers,
        dist['file_count'],
        width=first_bin_width,
        alpha=0.7,
        color='steelblue',
        edgecolor='black',
        linewidth=1,
    )

    # Write the count just above each non-empty bar
    for idx, n_files in enumerate(dist['file_count']):
        if n_files <= 0:
            continue
        ax.text(centers[idx], n_files + 0.5, f"{n_files}",
                ha='center', va='bottom', fontsize=8)

    ax.set_xlabel('Value')
    ax.set_ylabel('Count')
    ax.set_title(f'{title} Histogram')
    ax.grid(True, linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()
[15]:
import time
start_time = time.time()
# Distribution of the part labels (array "part_label" in group "Labels").
# The variable keeps its original name, face_dist, so that any cell referring to
# it still works, even though it holds label data rather than face data.
face_dist = explorer.create_distribution(key="part_label", bins=None, group="Labels")
print(f"Part label distribution created in {(time.time() - start_time):.2f} seconds\n")
print_distribution_info(face_dist, title="Part labels")
Material distribution created in 2.21 seconds

../../../_images/tutorials_hoops_ai_tutorials_notebooks_3a_ETL_pipeline_using_flow_fabewave_21_1.png

Dataset Visualization with DatasetViewer

The DatasetViewer is a powerful visualization tool that bridges dataset queries and visual analysis. It enables you to quickly visualize query results in two ways:

  1. Image Grids: Generate collages of PNG previews for rapid visual scanning

  2. Interactive 3D Views: Open inline 3D viewers for detailed model inspection

[16]:
# Import the DatasetViewer from the insights module
from hoops_ai.insights import DatasetViewer

# Create a DatasetViewer using the convenience method from_explorer.
# It builds the mapping from file IDs to their visualization files
# (the log line below reports how many files were mapped).
dataset_viewer = DatasetViewer.from_explorer(explorer)
2025-11-01 01:47:54 | INFO | hoops_ai.insights.dataset_viewer | Built file mapping for 4546 files
[17]:
start_time = time.time()

# Condition: keep only the files labelled "Washers". The numeric code (23) is
# looked up by class name in description_to_code instead of being hard-coded.
washers_code = description_to_code["Washers"]


def label_is_washers(ds):
    """Compare the dataset's 'part_label' array with the Washers class code."""
    return ds['part_label'] == washers_code


filelist = explorer.get_file_list(group="Labels", where=label_is_washers)
print(f"Filtering completed in {(time.time() - start_time):.2f} seconds")
print(len(filelist))
Filtering completed in 0.11 seconds
722

Example 1: Visualize Query Results as Image Grid

Now let’s use the query results we obtained earlier and visualize them as a grid of images. This is perfect for quickly scanning through many files to understand patterns or identify specific cases.

[ ]:
# Visualize the filtered files as an image grid (up to 25 previews, 8 per row) with file IDs as labels
fig = dataset_viewer.show_preview_as_image(
    filelist,
    k=25,                      # Show up to 25 files
    grid_cols=8,               # 8 columns
    label_format='id',         # Show file IDs as labels
    figsize=(15, 5)           # Figure size (width, height)
)

plt.show()

Machine Learning Training

[ ]:
# Load and split dataset for machine learning
from hoops_ai.dataset import DatasetLoader

# The .dataset and .infoset files sit next to the .flow file and share its stem.
flow_path = pathlib.Path(flow_file)
loader = DatasetLoader(
    merged_store_path=str(flow_path.parent / f"{flow_path.stem}.dataset"),
    parquet_file_path=str(flow_path.parent / f"{flow_path.stem}.infoset")
)

# Split dataset by part label (60/20/20) with explicit group parameter
train_size, val_size, test_size = loader.split(
    key="part_label",
    group="Labels",  # Explicitly specify the group for clarity
    train=0.6,
    validation=0.2,
    test=0.2,
    random_state=42
)

print(f"Dataset split: Train={train_size}, Validation={val_size}, Test={test_size}")

# Access training dataset
train_dataset = loader.get_dataset("train")
print(f"Training dataset ready with {len(train_dataset)} samples")
[ ]:
from hoops_ai.ml.EXPERIMENTAL import FlowTrainer


# Flow directory under the notebook folder: <nb_dir>/out/flows/ETL_Fabwave_training
flow_root_dir = nb_dir / "out" / "flows" / "ETL_Fabwave_training"
[ ]:
# Configure the trainer: which flow model to train, where the data comes from,
# where results are written, and the run settings (CPU, 1 device, 30 epochs,
# batches of 64).
flow_trainer = FlowTrainer(
    flowmodel=my_workflow_for_fabewave,
    datasetLoader=loader,
    experiment_name="HOOPS_AI_train",
    result_dir=flow_root_dir,
    accelerator="cpu",
    devices=1,
    max_epochs=30,
    batch_size=64,
)
[22]:
# Run the training and keep the returned checkpoint path; the testing cell
# passes it to flow_trainer.test().
# NOTE: training emits output fast enough to trip Jupyter's IOPub message rate
# limit (see the warning in this cell's recorded output); the run still completes.
trained_model_path = flow_trainer.train()
print(f"Training finished. Model checkpoint saved in {trained_model_path}")
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

`Trainer.fit` stopped: `max_epochs=30` reached.
Training finished. Model checkpoint saved in C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training\ml_output\HOOPS_AI_train\1101\014808\best.ckpt
[23]:
# Testing phase: evaluate the checkpoint produced by the training cell
flow_trainer.test(trained_model_path)
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed
print("Testing finished")
Restoring states from the checkpoint path at C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training\ml_output\HOOPS_AI_train\1101\014808\best.ckpt

-----------------------------------------------------------------------------------
GRAPH CLASSIFICATION MODEL - TESTING STEP
-----------------------------------------------------------------------------------

The trained model: C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training\ml_output\HOOPS_AI_train\1101\014808\best.ckpt

Test set contains 917 training samples
-----------------------------------------------------------------------------------

Loaded model weights from checkpoint at C:\Users\LuisSalazar\Documents\MAIN\MLProject\repo\HOOPS-AI-tutorials\notebooks\out\flows\ETL_Fabwave_training\ml_output\HOOPS_AI_train\1101\014808\best.ckpt
Number of classes: 45
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
           IoU              0.9566790352504638
    overall_accuracy             0.9921875
   per_class_accuracy       0.9844567179679871
        test_acc                 0.9921875
        test_loss           0.03970249742269516
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Testing finished
[24]:
# Re-open the flow output with a new DatasetExplorer and print its table of
# contents; the file_info listing it prints includes a `subset` column
# (train / validation / test).
# NOTE(review): this rebinds `explorer`, while `dataset_viewer` was built from
# the earlier instance. Presumably the viewer stays valid; confirm.
explorer = DatasetExplorer(flow_output_file=str(flow_file))
explorer.print_table_of_contents()
[DatasetExplorer] Default local cluster started: <Client: 'tcp://127.0.0.1:60191' processes=1 threads=16, memory=7.45 GiB>
[DatasetExplorer] All resources closed.

 Dataset Table of Contents

LABELS_GROUP:
  FILE_ID_CODE_LABELS_DATA: Shape: (4546,), Dims: ('Labels_part_label_dim_0',), Size: 4546
  PART_LABEL_DATA: Shape: (4546,), Dims: ('Labels_part_label_dim_0',), Size: 4546

EDGES_GROUP:
  EDGE_CONVEXITIES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_DIHEDRAL_ANGLES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_INDICES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_LENGTHS_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_TYPES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGE_U_GRIDS_DATA: Shape: (337065, 10, 6), Dims: ('edge', 'u', 'component'), Size: 20223900
  FILE_ID_CODE_EDGES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065

FACES_GROUP:
  FACE_AREAS_DATA: Shape: (130923,), Dims: ('face',), Size: 130923
  FACE_DISCRETIZATION_DATA: Shape: (130923, 100, 7), Dims: ('face', 'sample', 'component'), Size: 91646100
  FACE_INDICES_DATA: Shape: (130923,), Dims: ('face',), Size: 130923
  FACE_LOOPS_DATA: Shape: (130923,), Dims: ('face',), Size: 130923
  FACE_TYPES_DATA: Shape: (130923,), Dims: ('face',), Size: 130923
  FILE_ID_CODE_FACES_DATA: Shape: (130923,), Dims: ('face',), Size: 130923

GRAPH_GROUP:
  EDGES_DESTINATION_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  EDGES_SOURCE_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  FILE_ID_CODE_GRAPH_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
  NUM_NODES_DATA: Shape: (337065,), Dims: ('edge',), Size: 337065
==================================
Columns in file_info:
                                  name    id                             description                        stream_cache_png                         stream_cache_3d      subset table_name
0     000757c971d4af379cff2bf219566f76     0  ...25a-d5c6-4a99-a7f1-efc5c122e442.stp  ...00757c971d4af379cff2bf219566f76.png  ...00757c971d4af379cff2bf219566f76.scs        test  file_info
1     000d2096f1b75775aee0cca3869062ac     1  ...376-4e32-4a62-8636-833a596c3a24.stp  ...00d2096f1b75775aee0cca3869062ac.png  ...00d2096f1b75775aee0cca3869062ac.scs        test  file_info
2     0018d103885051d4463bfbdf97642644     2  ...8a0-14fc-4bcc-94f5-7e206c7ab2eb.stp  ...018d103885051d4463bfbdf97642644.png  ...018d103885051d4463bfbdf97642644.scs        test  file_info
3     001ec9bc5e47900ff95873b20a8cf97e     3  ...s\STEP\sshettigarsleevewasher30.stp  ...01ec9bc5e47900ff95873b20a8cf97e.png  ...01ec9bc5e47900ff95873b20a8cf97e.scs        test  file_info
4     002b9953b2669f0dbf7e95a97d47e759     4  ...b39-9f03-490e-b962-55ec3761f456.stp  ...02b9953b2669f0dbf7e95a97d47e759.png  ...02b9953b2669f0dbf7e95a97d47e759.scs       train  file_info
5     0043ebf2efcc3849c5b61c45f4190369     5  ...aae-e8c6-40df-a29b-5770a2d5a73c.stp  ...043ebf2efcc3849c5b61c45f4190369.png  ...043ebf2efcc3849c5b61c45f4190369.scs       train  file_info
6     0045b381a0be8fbc8d49688a6479a076     6  ...ef6-87ef-45b6-8c9c-0414903668e3.stp  ...045b381a0be8fbc8d49688a6479a076.png  ...045b381a0be8fbc8d49688a6479a076.scs       train  file_info
7     005479bbb0a5df52f6d9cc2ad0f8253c     7  ...s\STEP\sshettigarsleevewasher44.stp  ...05479bbb0a5df52f6d9cc2ad0f8253c.png  ...05479bbb0a5df52f6d9cc2ad0f8253c.scs       train  file_info
8     0054a7cfbadf02dce98531f4dd7508a5     8  ...800-c9bd-4dd6-b425-30bf848fcd41.stp  ...054a7cfbadf02dce98531f4dd7508a5.png  ...054a7cfbadf02dce98531f4dd7508a5.scs       train  file_info
9     00a5e9cd26129c2757fe6a08407df0f9     9  ...dbe-97b8-46d4-b550-aa40dab589cd.stp  ...0a5e9cd26129c2757fe6a08407df0f9.png  ...0a5e9cd26129c2757fe6a08407df0f9.scs       train  file_info
...                                ...   ...                                     ...                                     ...                                     ...         ...        ...
4536  ff3d884bd5a5e96e491a4ea638d583e0  4536  ...5ae-1468-46f5-bce0-c959e3cbf870.stp  ...f3d884bd5a5e96e491a4ea638d583e0.png  ...f3d884bd5a5e96e491a4ea638d583e0.scs       train  file_info
4537  ff3fcef336d538c95a1f04ccf01339f1  4537  ...02-e51d-43d8-bd4b-171c3589485b.step  ...f3fcef336d538c95a1f04ccf01339f1.png  ...f3fcef336d538c95a1f04ccf01339f1.scs       train  file_info
4538  ff69cdbe147e5b2b231a48c668268d3b  4538  ...c3a-a37f-416c-8ddb-e0683a1a8926.stp  ...f69cdbe147e5b2b231a48c668268d3b.png  ...f69cdbe147e5b2b231a48c668268d3b.scs  validation  file_info
4539  ff860352bbd8c7af1b596835829288c1  4539  ...543-8b94-4dee-a8e5-fdc362a80359.stp  ...f860352bbd8c7af1b596835829288c1.png  ...f860352bbd8c7af1b596835829288c1.scs       train  file_info
4540  ff94b5f50c009fcc19e231c8dc4fef18  4540  ...lasses\Brackets\STEP\bracket217.stp  ...f94b5f50c009fcc19e231c8dc4fef18.png  ...f94b5f50c009fcc19e231c8dc4fef18.scs  validation  file_info
4541  ff96da3be8fa50b0c3516392e7c57770  4541  ...cdb-e812-42b6-a29f-5973b8b41042.stp  ...f96da3be8fa50b0c3516392e7c57770.png  ...f96da3be8fa50b0c3516392e7c57770.scs       train  file_info
4542  ffb399838ecd3d81f1e05edd13002887  4542  ...106-6aba-43ff-b412-4b97010388b2.stp  ...fb399838ecd3d81f1e05edd13002887.png  ...fb399838ecd3d81f1e05edd13002887.scs       train  file_info
4543  ffc34e21ca8c3b0aefc40057489b1db0  4543  ...30-ad62-4695-9e60-317db1586e81.step  ...fc34e21ca8c3b0aefc40057489b1db0.png  ...fc34e21ca8c3b0aefc40057489b1db0.scs        test  file_info
4544  ffd6cacefde868ff77c8057d55add378  4544  ...83d-2422-433d-88d1-0193c2c1471c.stp  ...fd6cacefde868ff77c8057d55add378.png  ...fd6cacefde868ff77c8057d55add378.scs  validation  file_info
4545  ffd8d4beab2e24522adc38308def99ec  4545  ...200-0d37-4719-b72d-b58833bef79a.stp  ...fd8d4beab2e24522adc38308def99ec.png  ...fd8d4beab2e24522adc38308def99ec.scs       train  file_info
[25]:
# Take the first 50 entries of the test subset's `indices` for a visual preview.
# NOTE(review): the next cell passes these to the viewer as file IDs, so
# `indices` presumably holds file IDs; confirm.
test_dataset = loader.get_dataset("test")
file_list = test_dataset.indices[:50]
print(file_list)
[  0   1   2   3  13  20  21  22  25  40  46  49  52  57  59  63  67  68
  74  78  82  86  99 105 108 110 111 117 124 126 128 133 136 142 146 147
 155 158 173 178 194 201 204 210 226 229 233 237 245 248]
[26]:
# Visualize the selected test files as an image grid
fig = dataset_viewer.show_preview_as_image(
    file_list,
    k=len(file_list),                      # Show up to len(file_list) files, i.e. the whole selection
    grid_cols=8,               # 8 grid columns
    figsize=(15, 15)           # Figure size; presumably (width, height) as in matplotlib (confirm)
)

plt.show()
../../../_images/tutorials_hoops_ai_tutorials_notebooks_3a_ETL_pipeline_using_flow_fabewave_35_0.png