Skip to content

Commit e534472

Browse files
authored
Add numpy to the mypy pre-commit environment (#20282)
Contributes to #11661

Authors:
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Matthew Roeschke (https://github.com/mroeschke)
- Kyle Edwards (https://github.com/KyleFromNVIDIA)
- Bradley Dice (https://github.com/bdice)

URL: #20282
1 parent 73d722e commit e534472

File tree

28 files changed: 183 additions (+183) and 91 deletions (-91)

28 files changed

+183
-91
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ repos:
3333
rev: 'v1.13.0'
3434
hooks:
3535
- id: mypy
36-
additional_dependencies: [types-cachetools, pyarrow-stubs]
36+
additional_dependencies: [types-cachetools, pyarrow-stubs, numpy]
3737
args: ["--config-file=pyproject.toml",
3838
"python/cudf/cudf",
3939
"python/pylibcudf/pylibcudf",

python/cudf/cudf/core/_internals/timezones.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def _read_tzfile_as_columns(
121121
from cudf.core.column.column import as_column
122122

123123
# this happens for UTC-like zones
124-
min_date = np.int64(np.iinfo("int64").min + 1).astype(
124+
min_date: np.datetime64 = np.int64(np.iinfo("int64").min + 1).astype(
125125
np.dtype("M8[s]")
126126
)
127127
return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) # type: ignore[return-value]

python/cudf/cudf/core/accessors/string.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4735,7 +4735,7 @@ def character_ngrams(
47354735
return result
47364736

47374737
def hash_character_ngrams(
4738-
self, n: int = 5, as_list: bool = False, seed: np.uint32 = 0
4738+
self, n: int = 5, as_list: bool = False, seed: int | np.uint32 = 0
47394739
) -> Series | Index:
47404740
"""
47414741
Generate hashes of n-grams from characters in a column of strings.
@@ -5326,7 +5326,7 @@ def minhash(
53265326
return self.minhash64(seed, a_column, b_column, width)
53275327

53285328
def minhash64(
5329-
self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
5329+
self, seed: int | np.uint64, a: ColumnLike, b: ColumnLike, width: int
53305330
) -> Series | Index:
53315331
"""
53325332
Compute the minhash of a strings column.
@@ -5377,7 +5377,7 @@ def minhash64(
53775377
)
53785378

53795379
def minhash_ngrams(
5380-
self, ngrams: int, seed: np.uint32, a: ColumnLike, b: ColumnLike
5380+
self, ngrams: int, seed: int | np.uint32, a: ColumnLike, b: ColumnLike
53815381
) -> Series | Index:
53825382
"""
53835383
Compute the minhash of a list column of strings.
@@ -5428,7 +5428,7 @@ def minhash_ngrams(
54285428
)
54295429

54305430
def minhash64_ngrams(
5431-
self, ngrams: int, seed: np.uint64, a: ColumnLike, b: ColumnLike
5431+
self, ngrams: int, seed: int | np.uint64, a: ColumnLike, b: ColumnLike
54325432
) -> Series | Index:
54335433
"""
54345434
Compute the minhash of a list column of strings.

python/cudf/cudf/core/column/column.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,15 +1771,15 @@ def astype(self, dtype: DtypeObj, copy: bool | None = False) -> ColumnBase:
17711771
if isinstance(dtype, CategoricalDtype):
17721772
result = self.as_categorical_column(dtype)
17731773
elif is_dtype_obj_interval(dtype):
1774-
result = self.as_interval_column(dtype)
1774+
result = self.as_interval_column(dtype) # type: ignore[arg-type]
17751775
elif is_dtype_obj_list(dtype) or is_dtype_obj_struct(dtype):
17761776
if self.dtype != dtype:
17771777
raise NotImplementedError(
17781778
f"Casting {self.dtype} columns not currently supported"
17791779
)
17801780
result = self
17811781
elif is_dtype_obj_decimal(dtype):
1782-
result = self.as_decimal_column(dtype)
1782+
result = self.as_decimal_column(dtype) # type: ignore[arg-type]
17831783
elif dtype.kind == "M":
17841784
result = self.as_datetime_column(dtype)
17851785
elif dtype.kind == "m":
@@ -2301,8 +2301,10 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike:
23012301
plc.TypeId.DECIMAL32,
23022302
}:
23032303
scale = -plc_scalar.type().scale()
2304+
# Narrow type for mypy - we know col_dtype is a decimal type from the check above
2305+
assert isinstance(col_dtype, DecimalDtype)
2306+
p = col_dtype.precision
23042307
# https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql
2305-
p = col_dtype.precision # type: ignore[union-attr]
23062308
nrows = len(self)
23072309
if reduction_op in {"min", "max"}:
23082310
new_p = p
@@ -2316,7 +2318,7 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike:
23162318
raise NotImplementedError(
23172319
f"{reduction_op} not implemented for decimal types."
23182320
)
2319-
precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) # type: ignore[union-attr]
2321+
precision = max(min(new_p, col_dtype.MAX_PRECISION), 0)
23202322
new_dtype = type(col_dtype)(precision, scale)
23212323
result_col = result_col.astype(new_dtype)
23222324
elif isinstance(col_dtype, IntervalDtype):

python/cudf/cudf/core/column/decimal.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -118,19 +118,31 @@ def _from_32_64_arrow(
118118
if isinstance(data, pa.ChunkedArray):
119119
data = data.combine_chunks()
120120
mask_buf, data_buf = data.buffers()
121-
rmm_data_buffer = rmm.DeviceBuffer.to_device(
122-
np.frombuffer(data_buf)
123-
.view(view_type)[::step]
124-
.copy()
125-
.view("uint8")
126-
)
127-
plc_column = plc.Column.from_rmm_buffer(
128-
rmm_data_buffer,
129-
plc.DataType(plc_type, -data.type.scale),
130-
len(data),
131-
[],
132-
)
133-
if mask_buf is not None:
121+
if data_buf is None:
122+
# If data_buf is None, create an empty column
123+
plc_column = plc.Column(
124+
data_type=plc.DataType(plc_type, -data.type.scale),
125+
size=0,
126+
data=None,
127+
mask=None,
128+
null_count=0,
129+
offset=0,
130+
children=[],
131+
)
132+
else:
133+
rmm_data_buffer = rmm.DeviceBuffer.to_device(
134+
np.frombuffer(data_buf)
135+
.view(view_type)[::step]
136+
.copy()
137+
.view("uint8")
138+
)
139+
plc_column = plc.Column.from_rmm_buffer(
140+
rmm_data_buffer,
141+
plc.DataType(plc_type, -data.type.scale),
142+
len(data),
143+
[],
144+
)
145+
if mask_buf is not None and data_buf is not None:
134146
mask_size = plc.null_mask.bitmask_allocation_size_bytes(len(data))
135147
if mask_buf.size < mask_size:
136148
rmm_mask_buffer = rmm.DeviceBuffer(size=mask_size)
@@ -391,7 +403,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:
391403

392404
def to_arrow(self) -> pa.Array:
393405
data_buf_32 = np.array(self.base_data.memoryview()).view("int32") # type: ignore[union-attr]
394-
data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
406+
data_buf_128: np.ndarray = np.empty(
407+
len(data_buf_32) * 4, dtype="int32"
408+
)
395409

396410
# use striding to set the first 32 bits of each 128-bit chunk:
397411
data_buf_128[::4] = data_buf_32
@@ -464,8 +478,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:
464478
return result
465479

466480
def to_arrow(self) -> pa.Array:
481+
dtype: Decimal128Dtype
467482
if isinstance(self.dtype, pd.ArrowDtype):
468-
dtype = pyarrow_dtype_to_cudf_dtype(self.dtype)
483+
dtype = pyarrow_dtype_to_cudf_dtype(self.dtype) # type: ignore[assignment]
469484
else:
470485
dtype = self.dtype
471486

@@ -510,7 +525,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:
510525

511526
def to_arrow(self) -> pa.Array:
512527
data_buf_64 = np.array(self.base_data.memoryview()).view("int64") # type: ignore[union-attr]
513-
data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")
528+
data_buf_128: np.ndarray = np.empty(
529+
len(data_buf_64) * 2, dtype="int64"
530+
)
514531

515532
# use striding to set the first 64 bits of each 128-bit chunk:
516533
data_buf_128[::2] = data_buf_64

python/cudf/cudf/core/column/lists.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -526,10 +526,17 @@ def join_list_elements(
526526
def minhash_ngrams(
527527
self,
528528
width: int,
529-
seed: np.uint32,
529+
seed: int | np.uint32,
530530
a: NumericalColumn,
531531
b: NumericalColumn,
532532
) -> Self:
533+
# Convert int to np.uint32 with validation
534+
if isinstance(seed, int):
535+
if seed < 0 or seed > np.iinfo(np.uint32).max:
536+
raise ValueError(
537+
f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
538+
)
539+
seed = np.uint32(seed)
533540
return type(self).from_pylibcudf( # type: ignore[return-value]
534541
plc.nvtext.minhash.minhash_ngrams(
535542
self.to_pylibcudf(mode="read"),
@@ -544,10 +551,17 @@ def minhash_ngrams(
544551
def minhash64_ngrams(
545552
self,
546553
width: int,
547-
seed: np.uint64,
554+
seed: int | np.uint64,
548555
a: NumericalColumn,
549556
b: NumericalColumn,
550557
) -> Self:
558+
# Convert int to np.uint64 with validation
559+
if isinstance(seed, int):
560+
if seed < 0 or seed > np.iinfo(np.uint64).max:
561+
raise ValueError(
562+
f"seed must be in range [0, {np.iinfo(np.uint64).max}]"
563+
)
564+
seed = np.uint64(seed)
551565
return type(self).from_pylibcudf( # type: ignore[return-value]
552566
plc.nvtext.minhash.minhash64_ngrams(
553567
self.to_pylibcudf(mode="read"),

python/cudf/cudf/core/column/numerical.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -821,7 +821,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
821821
# Kinds are the same but to_dtype is smaller
822822
if "float" in to_dtype_numpy.name:
823823
finfo = np.finfo(to_dtype_numpy)
824-
lower_, upper_ = finfo.min, finfo.max
824+
lower_: int | float
825+
upper_: int | float
826+
lower_, upper_ = finfo.min, finfo.max # type: ignore[assignment]
825827

826828
# Check specifically for np.pi values when casting to lower precision
827829
if self_dtype_numpy.itemsize > to_dtype_numpy.itemsize:

python/cudf/cudf/core/column/numerical_base.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,12 @@ def kurtosis(self, skipna: bool | None = None) -> float:
8686
skipna = True if skipna is None else skipna
8787

8888
if len(self) == 0 or self._can_return_nan(skipna=skipna):
89-
return _get_nan_for_dtype(self.dtype)
89+
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]
9090

9191
self = self.nans_to_nulls().dropna()
9292

9393
if len(self) < 4:
94-
return _get_nan_for_dtype(self.dtype)
94+
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]
9595

9696
n = len(self)
9797
miu = self.mean()
@@ -178,7 +178,7 @@ def quantile(
178178
except (TypeError, ValueError):
179179
pass
180180
return (
181-
_get_nan_for_dtype(self.dtype)
181+
_get_nan_for_dtype(self.dtype) # type: ignore[return-value]
182182
if scalar_result is NA
183183
else scalar_result
184184
)
@@ -221,7 +221,7 @@ def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
221221
skipna = True if skipna is None else skipna
222222

223223
if self._can_return_nan(skipna=skipna):
224-
return _get_nan_for_dtype(self.dtype)
224+
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]
225225

226226
# enforce linear in case the default ever changes
227227
result = self.quantile(
@@ -240,21 +240,21 @@ def cov(self, other: NumericalBaseColumn) -> float:
240240
or len(other) == 0
241241
or (len(self) == 1 and len(other) == 1)
242242
):
243-
return _get_nan_for_dtype(self.dtype)
243+
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]
244244

245245
result = (self - self.mean()) * (other - other.mean())
246246
cov_sample = result.sum() / (len(self) - 1)
247247
return cov_sample
248248

249249
def corr(self, other: NumericalBaseColumn) -> float:
250250
if len(self) == 0 or len(other) == 0:
251-
return _get_nan_for_dtype(self.dtype)
251+
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]
252252

253253
cov = self.cov(other)
254254
lhs_std, rhs_std = self.std(), other.std()
255255

256256
if not cov or lhs_std == 0 or rhs_std == 0:
257-
return _get_nan_for_dtype(self.dtype)
257+
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]
258258
return cov / lhs_std / rhs_std
259259

260260
def round(

python/cudf/cudf/core/column/string.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -629,11 +629,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
629629
@acquire_spill_lock()
630630
def minhash(
631631
self,
632-
seed: np.uint32,
632+
seed: int | np.uint32,
633633
a: NumericalColumn,
634634
b: NumericalColumn,
635635
width: int,
636636
) -> ListColumn:
637+
# Convert int to np.uint32 with validation
638+
if isinstance(seed, int):
639+
if seed < 0 or seed > np.iinfo(np.uint32).max:
640+
raise ValueError(
641+
f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
642+
)
643+
seed = np.uint32(seed)
637644
return type(self).from_pylibcudf( # type: ignore[return-value]
638645
plc.nvtext.minhash.minhash(
639646
self.to_pylibcudf(mode="read"),
@@ -647,11 +654,18 @@ def minhash(
647654
@acquire_spill_lock()
648655
def minhash64(
649656
self,
650-
seed: np.uint64,
657+
seed: int | np.uint64,
651658
a: NumericalColumn,
652659
b: NumericalColumn,
653660
width: int,
654661
) -> ListColumn:
662+
# Convert int to np.uint64 with validation
663+
if isinstance(seed, int):
664+
if seed < 0 or seed > np.iinfo(np.uint64).max:
665+
raise ValueError(
666+
f"seed must be in range [0, {np.iinfo(np.uint64).max}]"
667+
)
668+
seed = np.uint64(seed)
655669
return type(self).from_pylibcudf( # type: ignore[return-value]
656670
plc.nvtext.minhash.minhash64(
657671
self.to_pylibcudf(mode="read"),
@@ -689,8 +703,15 @@ def generate_character_ngrams(self, ngrams: int) -> ListColumn:
689703

690704
@acquire_spill_lock()
691705
def hash_character_ngrams(
692-
self, ngrams: int, seed: np.uint32
706+
self, ngrams: int, seed: int | np.uint32
693707
) -> ListColumn:
708+
# Convert int to np.uint32 with validation
709+
if isinstance(seed, int):
710+
if seed < 0 or seed > np.iinfo(np.uint32).max:
711+
raise ValueError(
712+
f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
713+
)
714+
seed = np.uint32(seed)
694715
result = plc.nvtext.generate_ngrams.hash_character_ngrams(
695716
self.to_pylibcudf(mode="read"), ngrams, seed
696717
)

python/cudf/cudf/core/column/struct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,8 @@ def base_size(self) -> int:
107107

108108
def to_arrow(self) -> pa.Array:
109109
children = [child.to_arrow() for child in self.children]
110-
dtype = (
111-
pyarrow_dtype_to_cudf_dtype(self.dtype)
110+
dtype: StructDtype = (
111+
pyarrow_dtype_to_cudf_dtype(self.dtype) # type: ignore[assignment]
112112
if isinstance(self.dtype, pd.ArrowDtype)
113113
else self.dtype
114114
)

0 commit comments

Comments
 (0)