#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
from typing import List

from torch import distributed as dist
from torch.utils.data import DataLoader
from torchrec.datasets.criteo import (
    CAT_FEATURE_COUNT,
    DAYS,
    DEFAULT_CAT_NAMES,
    DEFAULT_INT_NAMES,
    InMemoryBinaryCriteoIterDataPipe,
)
from torchrec.datasets.random import RandomRecDataset

# OSS import
try:
    # pyre-ignore[21]
    # @manual=//ai_codesign/benchmarks/dlrm/torchrec_dlrm/data:multi_hot_criteo
    from data.multi_hot_criteo import MultiHotCriteoIterDataPipe

except ImportError:
    pass

# internal import
try:
    from .multi_hot_criteo import MultiHotCriteoIterDataPipe  # noqa F811
except ImportError:
    pass

STAGES = ["train", "val", "test"]


def _get_random_dataloader(
    args: argparse.Namespace,
    stage: str,
) -> DataLoader:
    attr = f"limit_{stage}_batches"
    num_batches = getattr(args, attr)
    if stage in ["val", "test"] and args.test_batch_size is not None:
        batch_size = args.test_batch_size
    else:
        batch_size = args.batch_size
    return DataLoader(
        RandomRecDataset(
            keys=DEFAULT_CAT_NAMES,
            batch_size=batch_size,
            hash_size=args.num_embeddings,
            hash_sizes=(
                args.num_embeddings_per_feature
                if hasattr(args, "num_embeddings_per_feature")
                else None
            ),
            manual_seed=args.seed if hasattr(args, "seed") else None,
            ids_per_feature=1,
            num_dense=len(DEFAULT_INT_NAMES),
            num_batches=num_batches,
        ),
        batch_size=None,
        batch_sampler=None,
        pin_memory=args.pin_memory,
        num_workers=0,
    )
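
# NOTE (illustrative, inferred from the helper above rather than from dlrm_main.py):
# _get_random_dataloader only reads ``limit_{stage}_batches``, ``batch_size``,
# ``test_batch_size``, ``num_embeddings``, and ``pin_memory`` off ``args``, plus
# ``num_embeddings_per_feature`` and ``seed`` when present. It is not called by
# ``get_dataloader`` below, so a synthetic-data smoke test has to invoke it
# directly, for example:
#
#     args = argparse.Namespace(
#         limit_train_batches=4, batch_size=32, test_batch_size=None,
#         num_embeddings=1000, pin_memory=False,
#     )
#     loader = _get_random_dataloader(args, "train")
#     batch = next(iter(loader))  # torchrec Batch of random dense/sparse features
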


def _get_in_memory_dataloader(
    args: argparse.Namespace,
    stage: str,
) -> DataLoader:
    dir_path = args.data_path
    sparse_part = "sparse_multi_hot.npz"
    datapipe = MultiHotCriteoIterDataPipe

    if stage == "train":
        stage_files: List[List[str]] = [
            [os.path.join(dir_path, f"day_{i}_dense.npy") for i in range(DAYS - 1)],
            [os.path.join(dir_path, f"day_{i}_{sparse_part}") for i in range(DAYS - 1)],
            [os.path.join(dir_path, f"day_{i}_labels.npy") for i in range(DAYS - 1)],
        ]
    elif stage in ["val", "test"]:
        stage_files: List[List[str]] = [
            [os.path.join(dir_path, f"day_{DAYS-1}_dense.npy")],
            [os.path.join(dir_path, f"day_{DAYS-1}_{sparse_part}")],
            [os.path.join(dir_path, f"day_{DAYS-1}_labels.npy")],
        ]
    if stage in ["val", "test"] and args.test_batch_size is not None:
        batch_size = args.test_batch_size
    else:
        batch_size = args.batch_size
    dataloader = DataLoader(
        datapipe(
            stage,
            *stage_files,  # pyre-ignore[6]
            batch_size=batch_size,
            rank=0,  # dist.get_rank(),
            world_size=1,  # dist.get_world_size(),
            drop_last=args.drop_last_training_batch if stage == "train" else False,
            shuffle_batches=args.shuffle_batches,
            shuffle_training_set=args.shuffle_training_set,
            shuffle_training_set_random_seed=args.seed,
            mmap_mode=args.mmap_mode,
            hashes=(
                args.num_embeddings_per_feature
                if args.num_embeddings is None
                else ([args.num_embeddings] * CAT_FEATURE_COUNT)
            ),
        ),
        batch_size=None,
        pin_memory=args.pin_memory,
        collate_fn=lambda x: x,
    )
    return dataloader
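
# Expected on-disk layout (inferred from the path construction above, not an
# official spec): ``args.data_path`` holds one dense, one multi-hot sparse, and
# one label file per Criteo day, e.g.
#
#     day_0_dense.npy   day_0_sparse_multi_hot.npz   day_0_labels.npy
#     ...
#     day_23_dense.npy  day_23_sparse_multi_hot.npz  day_23_labels.npy
#
# (torchrec's DAYS is 24 for the Criteo 1TB dataset.) Days 0 .. DAYS-2 feed the
# "train" stage; the final day (DAYS-1) is reused for both "val" and "test".
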


def get_dataloader(args: argparse.Namespace, backend: str, stage: str) -> DataLoader:
    """
    Gets the desired dataloader from dlrm_main command line options. In this file the
    returned DataLoader always wraps a MultiHotCriteoIterDataPipe over the preprocessed
    multi-hot Criteo dataset under ``args.data_path``; ``_get_random_dataloader`` is
    kept as a synthetic-data helper but is not called here.

    Args:
        args (argparse.Namespace): Command line options supplied to dlrm_main.py's main
            function.
        backend (str): "nccl" or "gloo". Only used to choose a default for
            ``args.pin_memory`` when that attribute is absent.
        stage (str): "train", "val", or "test".

    Returns:
        dataloader (DataLoader): PyTorch dataloader for the specified options.
    """
    stage = stage.lower()
    if stage not in STAGES:
        raise ValueError(f"Supplied stage was {stage}. Must be one of {STAGES}.")

    args.pin_memory = (
        (backend == "nccl") if not hasattr(args, "pin_memory") else args.pin_memory
    )

    return _get_in_memory_dataloader(args, stage)
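

# A minimal usage sketch, assuming MLPerf-style preprocessed multi-hot Criteo files
# already exist under ``data_path`` and that MultiHotCriteoIterDataPipe was importable
# above. All values below are placeholders, not defaults taken from dlrm_main.py.
if __name__ == "__main__":
    example_args = argparse.Namespace(
        data_path="/data/criteo_1tb_multi_hot",  # hypothetical dataset directory
        batch_size=2048,
        test_batch_size=None,
        drop_last_training_batch=False,
        shuffle_batches=False,
        shuffle_training_set=False,
        seed=0,
        mmap_mode=True,
        num_embeddings=None,
        num_embeddings_per_feature=[1_000_000] * CAT_FEATURE_COUNT,  # placeholder table sizes
    )
    train_loader = get_dataloader(example_args, backend="gloo", stage="train")
    print(next(iter(train_loader)))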