From 148f87d93a5fb17c58f000e4784016309d21a732 Mon Sep 17 00:00:00 2001 From: eicchen Date: Wed, 16 Jul 2025 22:11:17 -0500 Subject: [PATCH 1/7] Initial testcase for read_csv --- .../io/parser/usecols/test_usecols_basic.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 82b42beb38ae0..1e4b7dbaa0167 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -545,3 +545,27 @@ def test_usecols_dtype(all_parsers): {"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [(2, 0), ("c", "a")]) +def test_usecols_order(all_parsers, usecols, request): + # TODO add future flag + parser = all_parsers + data = """\ +a,b,c,d +1,2,3,0 +4,5,6, +7,8,9,0 +10,11,12,13""" + # print(usecols) + # print(data) + + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols) + return + + result = parser.read_csv(StringIO(data), usecols=usecols) + + expected = DataFrame([[3, 1], [6, 4], [9, 7], [12, 10]], columns=["c", "a"]) + tm.assert_frame_equal(result, expected) From b94edd5c1e8ea18d35acdb1d7f2eaba9d048e0bb Mon Sep 17 00:00:00 2001 From: eicchen Date: Fri, 18 Jul 2025 17:36:10 -0500 Subject: [PATCH 2/7] Added missing 0 in testcase --- pandas/tests/io/parser/usecols/test_usecols_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 1e4b7dbaa0167..01dfa526044b4 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -554,7 +554,7 @@ def test_usecols_order(all_parsers, 
usecols, request): data = """\ a,b,c,d 1,2,3,0 -4,5,6, +4,5,6,0 7,8,9,0 10,11,12,13""" # print(usecols) From 7971351dfc21007fc6d76997a03248886a267941 Mon Sep 17 00:00:00 2001 From: eicchen Date: Mon, 21 Jul 2025 15:45:20 -0500 Subject: [PATCH 3/7] Added simple implementation of usecols order for read_csv --- pandas/io/parsers/readers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 4fbd71ed03662..5641e7948ce50 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -11,6 +11,7 @@ defaultdict, ) import csv +from inspect import isfunction import sys from textwrap import fill from typing import ( @@ -1516,8 +1517,10 @@ def read(self, nrows: int | None = None) -> DataFrame: if hasattr(self, "orig_options"): dtype_arg = self.orig_options.get("dtype", None) + usecols = self.orig_options["usecols"] else: dtype_arg = None + usecols = None if isinstance(dtype_arg, dict): dtype = defaultdict(lambda: None) # type: ignore[var-annotated] @@ -1530,6 +1533,18 @@ def read(self, nrows: int | None = None) -> DataFrame: else: dtype = None + if dtype is None: + if usecols is None or isfunction(usecols): + # Doesn't change anything if function or None gets passed + pass + elif len(usecols) == len(columns): + # uses size of number in usecols to determine corresponding columns + usecols_sorted = sorted( + range(len(usecols)), key=lambda i: usecols[i] + ) + columns = [columns[i] for i in usecols_sorted] + col_dict = {k: col_dict[k] for k in columns} + if dtype is not None: new_col_dict = {} for k, v in col_dict.items(): @@ -1548,7 +1563,6 @@ def read(self, nrows: int | None = None) -> DataFrame: index=index, copy=False, ) - self._currow += new_rows return df From e394592b8ed72f4c1cd084782b185861ae6d5b5f Mon Sep 17 00:00:00 2001 From: eicchen Date: Fri, 25 Jul 2025 17:06:11 -0500 Subject: [PATCH 4/7] Added future flag for usecols_use_order --- 
pandas/core/config_init.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 20fe8cbab1c9f..4e55aedbb2845 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -897,3 +897,13 @@ def register_converter_cb(key: str) -> None: "(at which point this option will be deprecated).", validator=is_one_of_factory([True, False]), ) + + cf.register_option( + "usecols_use_order", + False, + ": bool\n " + "Whether usecols parameter will use order of input when " + "making a DataFrame. \n This feature will be default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False]), + ) From 8c0d2d47e4604bbd2a49455a244dbe717ac96184 Mon Sep 17 00:00:00 2001 From: eicchen Date: Fri, 25 Jul 2025 17:38:38 -0500 Subject: [PATCH 5/7] Added check for future flag, improved testcase to check flag and more complicated usecols order --- pandas/io/parsers/readers.py | 4 ++- .../io/parser/usecols/test_usecols_basic.py | 27 +++++++++++++------ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 5641e7948ce50..29d3d0a9838b7 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -27,6 +27,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import ( @@ -1533,7 +1535,7 @@ def read(self, nrows: int | None = None) -> DataFrame: else: dtype = None - if dtype is None: + if dtype is None and get_option("future.usecols_use_order"): if usecols is None or isfunction(usecols): # Doesn't change anything if function or None gets passed pass diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 01dfa526044b4..00a52e1f3a18c 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ 
b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config.config import option_context + from pandas.errors import ParserError from pandas import ( @@ -547,9 +549,10 @@ def test_usecols_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols", [(2, 0), ("c", "a")]) -def test_usecols_order(all_parsers, usecols, request): - # TODO add future flag +@pytest.mark.parametrize("usecols", [(3, 0, 2), ("d", "a", "c")]) +@pytest.mark.parametrize("usecols_use_order", (True, False)) +def test_usecols_order(all_parsers, usecols, usecols_use_order): + # TODOE add portion in doc for 3.0 transition parser = all_parsers data = """\ a,b,c,d @@ -557,15 +560,23 @@ def test_usecols_order(all_parsers, usecols, request): 4,5,6,0 7,8,9,0 10,11,12,13""" - # print(usecols) - # print(data) + msg = "The pyarrow engine does not allow 'usecols' to be integer column positions" if parser.engine == "pyarrow" and isinstance(usecols[0], int): - with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), usecols=usecols) return result = parser.read_csv(StringIO(data), usecols=usecols) - expected = DataFrame([[3, 1], [6, 4], [9, 7], [12, 10]], columns=["c", "a"]) - tm.assert_frame_equal(result, expected) + if usecols_use_order: + expected = DataFrame( + {"d": [0, 0, 0, 13], "a": [1, 4, 7, 10], "c": [3, 6, 9, 12]} + ) + else: + expected = DataFrame( + {"a": [1, 4, 7, 10], "c": [3, 6, 9, 12], "d": [0, 0, 0, 13]} + ) + + with option_context("future.usecols_use_order", usecols_use_order): + tm.assert_frame_equal(result, expected) From 7ec20a24faa509589b5b5666f416b62d84f0f033 Mon Sep 17 00:00:00 2001 From: eicchen Date: Fri, 25 Jul 2025 18:25:19 -0500 Subject: [PATCH 6/7] Fixed issue with reading out of order lists, added exception for pyarrow. 
--- pandas/io/parsers/readers.py | 9 ++++---- .../io/parser/usecols/test_usecols_basic.py | 21 ++++++++++++------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 29d3d0a9838b7..847259403c72c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1535,16 +1535,15 @@ def read(self, nrows: int | None = None) -> DataFrame: else: dtype = None - if dtype is None and get_option("future.usecols_use_order"): + if get_option("future.usecols_use_order"): if usecols is None or isfunction(usecols): # Doesn't change anything if function or None gets passed pass elif len(usecols) == len(columns): # uses size of number in usecols to determine corresponding columns - usecols_sorted = sorted( - range(len(usecols)), key=lambda i: usecols[i] - ) - columns = [columns[i] for i in usecols_sorted] + value_ranked = {v: i for i, v in enumerate(sorted(usecols))} + usecols_pressed = [value_ranked[v] for v in usecols] + columns = [columns[i] for i in usecols_pressed] col_dict = {k: col_dict[k] for k in columns} if dtype is not None: diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 00a52e1f3a18c..e09f88ba3f113 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -554,6 +554,7 @@ def test_usecols_dtype(all_parsers): def test_usecols_order(all_parsers, usecols, usecols_use_order): # TODOE add portion in doc for 3.0 transition parser = all_parsers + pyarrow_flag = False data = """\ a,b,c,d 1,2,3,0 @@ -561,15 +562,18 @@ def test_usecols_order(all_parsers, usecols, usecols_use_order): 7,8,9,0 10,11,12,13""" - msg = "The pyarrow engine does not allow 'usecols' to be integer column positions" - if parser.engine == "pyarrow" and isinstance(usecols[0], int): - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), 
usecols=usecols) - return - - result = parser.read_csv(StringIO(data), usecols=usecols) + if parser.engine == "pyarrow": + if isinstance(usecols[0], int): + msg = "The pyarrow engine does not allow 'usecols' to be integer column" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), usecols=usecols) + return + else: + # looks like pyarrow already considers column order by default. + # Modifies test to account for it in selecting expected df + pyarrow_flag = True - if usecols_use_order: + if usecols_use_order or pyarrow_flag: expected = DataFrame( {"d": [0, 0, 0, 13], "a": [1, 4, 7, 10], "c": [3, 6, 9, 12]} ) @@ -579,4 +583,5 @@ def test_usecols_order(all_parsers, usecols, usecols_use_order): ) with option_context("future.usecols_use_order", usecols_use_order): + result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) From df37372acf871a54978886407b1141014bb43c19 Mon Sep 17 00:00:00 2001 From: eicchen Date: Fri, 25 Jul 2025 20:31:59 -0500 Subject: [PATCH 7/7] fixed issues where usecols became a required input --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 847259403c72c..7d345791b5a7d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1519,7 +1519,7 @@ def read(self, nrows: int | None = None) -> DataFrame: if hasattr(self, "orig_options"): dtype_arg = self.orig_options.get("dtype", None) - usecols = self.orig_options["usecols"] + usecols = self.orig_options.get("usecols", None) else: dtype_arg = None usecols = None