Skip to content

Commit 3de457f

Browse files
committed
IO/Stata: clarify error when input is not a Stata dataset; add GitHub Raw URL hint
1 parent 54ab806 commit 3de457f

File tree

2 files changed

+66
-13
lines changed

2 files changed

+66
-13
lines changed

pandas/io/stata.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2218,19 +2218,48 @@ def read_stata(
22182218
... # Operate on a single chunk, e.g., chunk.mean()
22192219
... pass # doctest: +SKIP
22202220
"""
2221-
reader = StataReader(
2222-
filepath_or_buffer,
2223-
convert_dates=convert_dates,
2224-
convert_categoricals=convert_categoricals,
2225-
index_col=index_col,
2226-
convert_missing=convert_missing,
2227-
preserve_dtypes=preserve_dtypes,
2228-
columns=columns,
2229-
order_categoricals=order_categoricals,
2230-
chunksize=chunksize,
2231-
storage_options=storage_options,
2232-
compression=compression,
2233-
)
2221+
try:
    reader = StataReader(
        filepath_or_buffer,
        convert_dates=convert_dates,
        convert_categoricals=convert_categoricals,
        index_col=index_col,
        convert_missing=convert_missing,
        preserve_dtypes=preserve_dtypes,
        columns=columns,
        order_categoricals=order_categoricals,
        chunksize=chunksize,
        storage_options=storage_options,
        compression=compression,
    )
except ValueError as e:
    # If users pass HTML/JSON/etc. (e.g., a GitHub page URL), StataReader
    # often raises a version/format ValueError. Replace with a clearer message.
    msg = str(e)
    if (
        "Version of given Stata file is" in msg
        or "not a Stata dataset" in msg
        or "not a valid Stata" in msg
    ):
        # Keep "32,767" inside a single literal: splitting it across two
        # concatenated strings previously rendered it as "32, 767".
        base = (
            "This is not a valid Stata dataset. This may be because it is not a "
            "valid Stata dataset, or a Stata dataset from a version of Stata that "
            "pandas cannot import. pandas supports importing versions 105, 108, "
            "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), "
            "117 (Stata 13), 118 (Stata 14/15/16), and 119 (Stata 15/16, over "
            "32,767 variables)."
        )
        hint = ""
        # Only path-like inputs can be inspected for a GitHub page URL;
        # open buffers carry no location to check.
        if isinstance(filepath_or_buffer, (str, os.PathLike)):
            s = os.fspath(filepath_or_buffer)
            if "github.com" in s and ("/blob/" in s or "/tree/" in s):
                hint = (
                    " If you're loading from GitHub, use the Raw file URL "
                    "(replace '/blob/' with '/raw/' or click the 'Raw' button)."
                )
        # Chain the original exception so the underlying cause stays visible.
        raise ValueError(base + hint) from e
    # Different error: keep original
    raise
22342263

22352264
if iterator or chunksize:
22362265
return reader

pandas/tests/io/test_stata.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
Series,
2525
)
2626

27+
from pandas.io import stata as stata_mod
2728
from pandas.io.parsers import read_csv
2829
from pandas.io.stata import (
2930
CategoricalConversionWarning,
@@ -2620,3 +2621,26 @@ def test_ascii_error(temp_file, version):
26202621
df.to_stata(temp_file, write_index=0, version=version)
26212622
df_input = read_stata(temp_file)
26222623
tm.assert_frame_equal(df, df_input)
2624+
2625+
2626+
class _BoomReader:
    """Stand-in for ``StataReader`` whose constructor always fails.

    Any positional or keyword arguments are accepted and ignored; creating
    an instance always raises ``ValueError`` with a fixed version message.
    """

    def __init__(self, *args, **kwargs):
        message = "Version of given Stata file is 10."
        raise ValueError(message)
2629+
2630+
2631+
def test_non_stata_gives_clear_message(monkeypatch, tmp_path):
    """A failing reader yields the clearer "not a valid Stata dataset" error."""
    bogus_path = tmp_path / "not_stata.dta"
    # Replace the real reader with one that always raises a version error,
    # so no actual file needs to exist at ``bogus_path``.
    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)
    expected = r"not a valid Stata dataset"
    with pytest.raises(ValueError, match=expected):
        read_stata(bogus_path)
2635+
2636+
2637+
def test_github_blob_hint_is_appended(monkeypatch):
    """A GitHub ``/blob/`` URL adds the "Raw file URL" hint to the error."""
    blob_url = "https://github.com/user/repo/blob/main/file.dta"
    # The patched reader fails before any network access is attempted.
    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)
    expected = r"Raw file URL"
    with pytest.raises(ValueError, match=expected):
        read_stata(blob_url)
2641+
2642+
2643+
def test_github_tree_hint_is_appended(monkeypatch):
    """A GitHub ``/tree/`` URL adds the "Raw file URL" hint to the error."""
    tree_url = "https://github.com/user/repo/tree/main/data"
    # The patched reader fails before any network access is attempted.
    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)
    expected = r"Raw file URL"
    with pytest.raises(ValueError, match=expected):
        read_stata(tree_url)

0 commit comments

Comments
 (0)