Skip to content

ENH: usecols takes input order for read_csv implementation review #61967

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,3 +897,13 @@ def register_converter_cb(key: str) -> None:
"(at which point this option will be deprecated).",
validator=is_one_of_factory([True, False]),
)

cf.register_option(
"usecols_use_order",
False,
": bool\n "
"Whether usecols parameter will use order of input when "
"making a DataFrame. \n This feature will be default in pandas 3.0"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that if this option is being introduced in 3.0, it won't be enforced until 4.0.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, is that the only concern with the implementation? If so I'll go ahead and apply it to other functions and update the docs.

I can also add changing the flag's default to the 4.0 milestone. I don't know if there is a timeline for 4.0 yet, but I figured it's better to add it while it's fresh.

"(at which point this option will be deprecated).",
validator=is_one_of_factory([True, False]),
)
17 changes: 16 additions & 1 deletion pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
defaultdict,
)
import csv
from inspect import isfunction
import sys
from textwrap import fill
from typing import (
Expand All @@ -26,6 +27,8 @@

import numpy as np

from pandas._config import get_option

from pandas._libs import lib
from pandas._libs.parsers import STR_NA_VALUES
from pandas.errors import (
Expand Down Expand Up @@ -1516,8 +1519,10 @@ def read(self, nrows: int | None = None) -> DataFrame:

if hasattr(self, "orig_options"):
dtype_arg = self.orig_options.get("dtype", None)
usecols = self.orig_options.get("usecols", None)
else:
dtype_arg = None
usecols = None

if isinstance(dtype_arg, dict):
dtype = defaultdict(lambda: None) # type: ignore[var-annotated]
Expand All @@ -1530,6 +1535,17 @@ def read(self, nrows: int | None = None) -> DataFrame:
else:
dtype = None

if get_option("future.usecols_use_order"):
if usecols is None or isfunction(usecols):
# Doesn't change anything if function or None gets passed
pass
elif len(usecols) == len(columns):
# uses size of number in usecols to determine corresponding columns
value_ranked = {v: i for i, v in enumerate(sorted(usecols))}
usecols_pressed = [value_ranked[v] for v in usecols]
columns = [columns[i] for i in usecols_pressed]
col_dict = {k: col_dict[k] for k in columns}

if dtype is not None:
new_col_dict = {}
for k, v in col_dict.items():
Expand All @@ -1548,7 +1564,6 @@ def read(self, nrows: int | None = None) -> DataFrame:
index=index,
copy=False,
)

self._currow += new_rows
return df

Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import numpy as np
import pytest

from pandas._config.config import option_context

from pandas.errors import ParserError

from pandas import (
Expand Down Expand Up @@ -545,3 +547,41 @@ def test_usecols_dtype(all_parsers):
{"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [(3, 0, 2), ("d", "a", "c")])
@pytest.mark.parametrize("usecols_use_order", (True, False))
def test_usecols_order(all_parsers, usecols, usecols_use_order):
    """
    Check how ``future.usecols_use_order`` affects result column order.

    With the option enabled the columns are expected in the order given by
    ``usecols`` (d, a, c); with it disabled they are expected in file order
    (a, c, d). The pyarrow engine is expected to reject integer ``usecols``
    and, for label ``usecols``, to return the requested order regardless of
    the option.
    """
    # TODO: add portion in doc for 3.0 transition
    parser = all_parsers
    data = """\
a,b,c,d
1,2,3,0
4,5,6,0
7,8,9,0
10,11,12,13"""

    is_pyarrow = parser.engine == "pyarrow"

    if is_pyarrow and isinstance(usecols[0], int):
        # Positional usecols are rejected by pyarrow; nothing more to check.
        msg = "The pyarrow engine does not allow 'usecols' to be integer column"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), usecols=usecols)
        return

    # pyarrow appears to honor the usecols order by default, so it shares the
    # "ordered" expectation even when the option is switched off.
    if usecols_use_order or is_pyarrow:
        column_order = ["d", "a", "c"]
    else:
        column_order = ["a", "c", "d"]

    column_values = {
        "a": [1, 4, 7, 10],
        "c": [3, 6, 9, 12],
        "d": [0, 0, 0, 13],
    }
    expected = DataFrame({name: column_values[name] for name in column_order})

    with option_context("future.usecols_use_order", usecols_use_order):
        result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
Loading