Yikun commented on PR #36083:
URL: https://github.com/apache/spark/pull/36083#issuecomment-1101966359
I guess we might want to consider the ideas below:
1. Generate these automatically and more accurately. The script below is a demo showing how to find the diff for `Series`; I believe we can easily expand it to all objects (see the sketch after the results). From the results we can see that the current PR is missing some diffs.
```python
import inspect
from inspect import getmembers, isfunction

import pandas as pd
import pyspark.pandas as ps

# Collect the public (non-underscore) functions of each Series class.
pds = pd.Series
pds_funcs = dict(m for m in getmembers(pds, isfunction) if not m[0].startswith("_"))
pss = ps.Series
pss_funcs = dict(m for m in getmembers(pss, isfunction) if not m[0].startswith("_"))

# Methods that exist in pandas but not in pyspark.pandas.
print(pds_funcs.keys() - pss_funcs.keys())

supported_func = pds_funcs.keys() & pss_funcs.keys()
for name in supported_func:
    pds_func = pds_funcs[name]
    pss_func = pss_funcs[name]
    # Parameters supported in pandas but not in pyspark.pandas.
    diff = set(inspect.signature(pds_func).parameters) - set(inspect.signature(pss_func).parameters)
    print(name, diff)
```
<details>
<summary>Results</summary>
Unsupported methods (N):
```
['asfreq', 'combine', 'convert_dtypes', 'infer_objects', 'info', 'memory_usage', 'ravel', 'reorder_levels', 'resample', 'searchsorted', 'set_axis', 'set_flags', 'slice_shift', 'to_hdf', 'to_period', 'to_pickle', 'to_sql', 'to_timestamp', 'to_xarray', 'tshift', 'tz_convert', 'tz_localize', 'view']
```
Unsupported parameters (P):
```
abs []
add ['axis', 'fill_value', 'level']
add_prefix []
add_suffix []
agg ['args', 'axis', 'kwargs']
aggregate ['args', 'axis', 'kwargs']
align ['broadcast_axis', 'fill_axis', 'fill_value', 'level', 'limit', 'method']
all ['bool_only', 'kwargs', 'level']
any ['bool_only', 'kwargs', 'level', 'skipna']
append []
apply ['convert_dtype', 'kwargs']
argmax ['args', 'axis', 'kwargs', 'skipna']
argmin ['args', 'axis', 'kwargs', 'skipna']
argsort ['axis', 'kind', 'order']
asof ['subset']
astype ['copy', 'errors']
at_time []
autocorr ['lag']
backfill ['downcast']
between []
between_time ['inclusive']
bfill ['downcast']
bool []
clip ['args', 'axis', 'kwargs']
combine_first []
compare ['align_axis']
copy []
corr ['min_periods']
count ['level']
cov ['ddof']
cummax ['args', 'axis', 'kwargs']
cummin ['args', 'axis', 'kwargs']
cumprod ['args', 'axis', 'kwargs']
cumsum ['args', 'axis', 'kwargs']
describe ['datetime_is_numeric', 'exclude', 'include']
diff []
div ['axis', 'fill_value', 'level']
divide ['axis', 'fill_value', 'level']
divmod ['axis', 'fill_value', 'level']
dot []
drop ['axis', 'columns', 'errors', 'inplace']
drop_duplicates []
droplevel ['axis']
dropna ['how']
duplicated []
eq ['axis', 'fill_value', 'level']
equals []
ewm ['adjust', 'axis', 'ignore_na', 'method', 'times']
expanding ['axis', 'center', 'method']
explode ['ignore_index']
factorize []
ffill ['downcast']
fillna ['downcast']
filter []
first []
first_valid_index []
floordiv ['axis', 'fill_value', 'level']
ge ['axis', 'fill_value', 'level']
get []
groupby ['group_keys', 'level', 'observed', 'sort', 'squeeze']
gt ['axis', 'fill_value', 'level']
head []
hist ['ax', 'backend', 'by', 'figsize', 'grid', 'kwargs', 'legend', 'xlabelsize', 'xrot', 'ylabelsize', 'yrot']
idxmax ['args', 'axis', 'kwargs']
idxmin ['args', 'axis', 'kwargs']
interpolate ['axis', 'downcast', 'inplace', 'kwargs', 'limit_area', 'limit_direction']
isin []
isna []
isnull []
item []
items []
iteritems []
keys []
kurt ['kwargs', 'level', 'skipna']
kurtosis ['kwargs', 'level', 'skipna']
last []
last_valid_index []
le ['axis', 'fill_value', 'level']
lt ['axis', 'fill_value', 'level']
mad ['axis', 'level', 'skipna']
map []
mask ['axis', 'errors', 'inplace', 'level', 'try_cast']
max ['kwargs', 'level', 'skipna']
mean ['kwargs', 'level', 'skipna']
median ['kwargs', 'level', 'skipna']
min ['kwargs', 'level', 'skipna']
mod ['axis', 'fill_value', 'level']
mode []
mul ['axis', 'fill_value', 'level']
multiply ['axis', 'fill_value', 'level']
ne ['axis', 'fill_value', 'level']
nlargest ['keep']
notna []
notnull []
nsmallest ['keep']
nunique []
pad ['downcast']
pct_change ['fill_method', 'freq', 'kwargs', 'limit']
pipe []
pop []
pow ['axis', 'fill_value', 'level']
prod ['kwargs', 'level', 'skipna']
product ['kwargs', 'level', 'skipna']
quantile ['interpolation']
radd ['axis', 'fill_value', 'level']
rank ['axis', 'na_option', 'pct']
rdiv ['axis', 'fill_value', 'level']
rdivmod ['axis', 'fill_value', 'level']
reindex ['args', 'kwargs']
reindex_like ['copy', 'limit', 'method', 'tolerance']
rename ['axis', 'copy', 'errors', 'inplace', 'level']
rename_axis ['axis', 'columns', 'copy']
repeat ['axis']
replace ['inplace', 'limit', 'method']
reset_index []
rfloordiv ['axis', 'fill_value', 'level']
rmod ['axis', 'fill_value', 'level']
rmul ['axis', 'fill_value', 'level']
rolling ['axis', 'center', 'closed', 'method', 'on', 'win_type']
round ['args', 'kwargs']
rpow ['axis', 'fill_value', 'level']
rsub ['axis', 'fill_value', 'level']
rtruediv ['axis', 'fill_value', 'level']
sample ['axis', 'ignore_index', 'weights']
sem ['kwargs', 'level', 'skipna']
shift ['axis', 'freq']
skew ['kwargs', 'level', 'skipna']
sort_index ['ignore_index', 'key', 'sort_remaining']
sort_values ['axis', 'ignore_index', 'key', 'kind']
squeeze []
std ['kwargs', 'level', 'skipna']
sub ['axis', 'fill_value', 'level']
subtract ['axis', 'fill_value', 'level']
sum ['kwargs', 'level', 'skipna']
swapaxes ['axis1', 'axis2']
swaplevel []
tail []
take ['axis', 'is_copy', 'kwargs']
to_clipboard []
to_csv ['chunksize', 'compression', 'decimal', 'doublequote', 'encoding', 'errors', 'float_format', 'index', 'index_label', 'line_terminator', 'path_or_buf', 'quoting', 'storage_options']
to_dict []
to_excel ['storage_options']
to_frame []
to_json ['date_format', 'date_unit', 'default_handler', 'double_precision', 'force_ascii', 'indent', 'index', 'path_or_buf', 'storage_options']
to_latex ['caption', 'label', 'position']
to_list []
to_markdown ['index', 'kwargs', 'storage_options']
to_numpy ['copy', 'dtype', 'kwargs', 'na_value']
to_string ['min_rows']
tolist []
transform []
transpose []
truediv ['axis', 'fill_value', 'level']
truncate []
unique []
unstack ['fill_value']
update []
value_counts []
var ['kwargs', 'level', 'skipna']
where ['axis', 'errors', 'inplace', 'level', 'try_cast']
xs ['axis', 'drop_level']
```
</details>
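
As a rough illustration of expanding the same check to other objects, here is a minimal sketch; the `PAIRS` list and the `public_funcs` helper are just for demonstration and not part of this PR:
```python
import inspect
from inspect import getmembers, isfunction

import pandas as pd
import pyspark.pandas as ps

# Object pairs to compare; extend as needed (illustrative only).
PAIRS = [
    ("Series", pd.Series, ps.Series),
    ("DataFrame", pd.DataFrame, ps.DataFrame),
    ("Index", pd.Index, ps.Index),
]

def public_funcs(cls):
    # Public (non-underscore) functions defined on the class.
    return dict(m for m in getmembers(cls, isfunction) if not m[0].startswith("_"))

for obj_name, pd_cls, ps_cls in PAIRS:
    pd_funcs = public_funcs(pd_cls)
    ps_funcs = public_funcs(ps_cls)

    # Methods that exist in pandas but not in pyspark.pandas.
    print(f"== {obj_name}: unsupported methods ==")
    print(sorted(pd_funcs.keys() - ps_funcs.keys()))

    # Parameters that exist in pandas but not in pyspark.pandas.
    print(f"== {obj_name}: unsupported parameters ==")
    for func_name in sorted(pd_funcs.keys() & ps_funcs.keys()):
        diff = (set(inspect.signature(pd_funcs[func_name]).parameters)
                - set(inspect.signature(ps_funcs[func_name]).parameters))
        if diff:
            print(func_name, sorted(diff))
```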
2. Should we also mention the specific pandas version? With that, we could easily see the diff between pandas and pyspark.pandas (see the sketch below).
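For example, a minimal sketch of recording which pandas version the diff was generated against (the output format is just an assumption):
```python
import pandas as pd

# State the pandas version the comparison was run against,
# so the supported-API page can mention it explicitly.
print(f"Compared against pandas {pd.__version__}")
```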