ARROW-439: [Python] Add option in "to_pandas" conversions to yield Categorical from...
[arrow.git] / python / pyarrow / tests / test_convert_pandas.py
1 # -*- coding: utf-8 -*-
2 # Licensed to the Apache Software Foundation (ASF) under one
3 # or more contributor license agreements. See the NOTICE file
4 # distributed with this work for additional information
5 # regarding copyright ownership. The ASF licenses this file
6 # to you under the Apache License, Version 2.0 (the
7 # "License"); you may not use this file except in compliance
8 # with the License. You may obtain a copy of the License at
9 #
10 # http://www.apache.org/licenses/LICENSE-2.0
11 #
12 # Unless required by applicable law or agreed to in writing,
13 # software distributed under the License is distributed on an
14 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 # KIND, either express or implied. See the License for the
16 # specific language governing permissions and limitations
17 # under the License.
18
19 from collections import OrderedDict
20
21 from datetime import datetime, date, time
22 import unittest
23 import decimal
24 import json
25
26 import pytest
27
28 import numpy as np
29
30 import pandas as pd
31 import pandas.util.testing as tm
32
33 from pyarrow.compat import u
34 import pyarrow as pa
35
36 from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
37
38
39 def _alltypes_example(size=100):
40 return pd.DataFrame({
41 'uint8': np.arange(size, dtype=np.uint8),
42 'uint16': np.arange(size, dtype=np.uint16),
43 'uint32': np.arange(size, dtype=np.uint32),
44 'uint64': np.arange(size, dtype=np.uint64),
45 'int8': np.arange(size, dtype=np.int16),
46 'int16': np.arange(size, dtype=np.int16),
47 'int32': np.arange(size, dtype=np.int32),
48 'int64': np.arange(size, dtype=np.int64),
49 'float32': np.arange(size, dtype=np.float32),
50 'float64': np.arange(size, dtype=np.float64),
51 'bool': np.random.randn(size) > 0,
52 # TODO(wesm): Pandas only support ns resolution, Arrow supports s, ms,
53 # us, ns
54 'datetime': np.arange("2016-01-01T00:00:00.001", size,
55 dtype='datetime64[ms]'),
56 'str': [str(x) for x in range(size)],
57 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
58 'empty_str': [''] * size
59 })
60
61
class TestPandasConversion(unittest.TestCase):
    """Round-trip tests between pandas DataFrames/Series and Arrow tables.

    Most tests funnel through two helpers: ``_check_pandas_roundtrip`` for
    whole DataFrames and ``_check_array_roundtrip`` for single columns.
    """

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
                                timestamps_to_ms=False, expected_schema=None,
                                check_dtype=True, schema=None,
                                check_index=False):
        """Convert df -> Arrow Table -> DataFrame and compare with expected.

        If ``expected`` is None the round-tripped frame must equal ``df``
        itself; ``expected_schema`` additionally pins the inferred Arrow
        schema.  ``check_index=True`` preserves the pandas index through
        the conversion.
        """
        table = pa.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms,
                                     schema=schema, preserve_index=check_index)
        result = table.to_pandas(nthreads=nthreads)
        if expected_schema:
            assert table.schema.equals(expected_schema)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected, check_dtype=check_dtype)

    def _check_array_roundtrip(self, values, expected=None, mask=None,
                               timestamps_to_ms=False, type=None):
        """Convert values -> Arrow Array -> pandas and verify null handling.

        The Arrow null count must match the nulls implied by ``values``
        (plus ``mask`` when given), and the round-tripped data must compare
        equal as a Series.
        """
        arr = pa.Array.from_pandas(values, timestamps_to_ms=timestamps_to_ms,
                                   mask=mask, type=type)
        result = arr.to_pandas()

        values_nulls = pd.isnull(values)
        if mask is None:
            assert arr.null_count == values_nulls.sum()
        else:
            assert arr.null_count == (mask | values_nulls).sum()

        if mask is None:
            tm.assert_series_equal(pd.Series(result), pd.Series(values),
                                   check_names=False)
        else:
            expected = pd.Series(np.ma.masked_array(values, mask=mask))
            tm.assert_series_equal(pd.Series(result), expected,
                                   check_names=False)

    def test_all_none_objects(self):
        df = pd.DataFrame({'a': [None, None, None]})
        self._check_pandas_roundtrip(df)

    def test_all_none_category(self):
        df = pd.DataFrame({'a': [None, None, None]})
        df['a'] = df['a'].astype('category')
        self._check_pandas_roundtrip(df)

    def test_non_string_columns(self):
        # Non-string column labels are stringified on conversion
        df = pd.DataFrame({0: [1, 2, 3]})
        table = pa.Table.from_pandas(df)
        assert table.column(0).name == '0'

    def test_float_no_nulls(self):
        data = {}
        fields = []
        dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
        num_values = 100

        for numpy_dtype, arrow_dtype in dtypes:
            values = np.random.randn(num_values)
            data[numpy_dtype] = values.astype(numpy_dtype)
            fields.append(pa.field(numpy_dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = pa.schema(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)

    def test_float_nulls(self):
        num_values = 100

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
        names = ['f4', 'f8']
        expected_cols = []

        arrays = []
        fields = []
        for name, arrow_dtype in dtypes:
            values = np.random.randn(num_values).astype(name)

            arr = pa.Array.from_pandas(values, null_mask)
            arrays.append(arr)
            fields.append(pa.field(name, arrow_dtype))
            # masked slots come back as NaN on the pandas side
            values[null_mask] = np.nan

            expected_cols.append(values)

        ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                                columns=names)

        table = pa.Table.from_arrays(arrays, names)
        assert table.schema.equals(pa.schema(fields))
        result = table.to_pandas()
        tm.assert_frame_equal(result, ex_frame)

    def test_float_object_nulls(self):
        arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
        df = pd.DataFrame({'floats': arr})
        expected = pd.DataFrame({'floats': pd.to_numeric(arr)})
        field = pa.field('floats', pa.float64())
        schema = pa.schema([field])
        self._check_pandas_roundtrip(df, expected=expected,
                                     expected_schema=schema)

    def test_int_object_nulls(self):
        arr = np.array([None, 1, np.int64(3)] * 5, dtype=object)
        df = pd.DataFrame({'ints': arr})
        expected = pd.DataFrame({'ints': pd.to_numeric(arr)})
        field = pa.field('ints', pa.int64())
        schema = pa.schema([field])
        self._check_pandas_roundtrip(df, expected=expected,
                                     expected_schema=schema)

    def test_integer_no_nulls(self):
        data = OrderedDict()
        fields = []

        numpy_dtypes = [
            ('i1', pa.int8()), ('i2', pa.int16()),
            ('i4', pa.int32()), ('i8', pa.int64()),
            ('u1', pa.uint8()), ('u2', pa.uint16()),
            ('u4', pa.uint32()), ('u8', pa.uint64()),
            ('longlong', pa.int64()), ('ulonglong', pa.uint64())
        ]
        num_values = 100

        for dtype, arrow_dtype in numpy_dtypes:
            # clamp the sample range so randint stays within platform int
            info = np.iinfo(dtype)
            values = np.random.randint(max(info.min, np.iinfo(np.int_).min),
                                       min(info.max, np.iinfo(np.int_).max),
                                       size=num_values)
            data[dtype] = values.astype(dtype)
            fields.append(pa.field(dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = pa.schema(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)

    def test_integer_with_nulls(self):
        # pandas requires upcast to float dtype

        int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
        num_values = 100

        null_mask = np.random.randint(0, 10, size=num_values) < 3

        expected_cols = []
        arrays = []
        for name in int_dtypes:
            values = np.random.randint(0, 100, size=num_values)

            arr = pa.Array.from_pandas(values, null_mask)
            arrays.append(arr)

            expected = values.astype('f8')
            expected[null_mask] = np.nan

            expected_cols.append(expected)

        ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                                columns=int_dtypes)

        table = pa.Table.from_arrays(arrays, int_dtypes)
        result = table.to_pandas()

        tm.assert_frame_equal(result, ex_frame)

    def test_boolean_no_nulls(self):
        num_values = 100

        np.random.seed(0)

        df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
        field = pa.field('bools', pa.bool_())
        schema = pa.schema([field])
        self._check_pandas_roundtrip(df, expected_schema=schema)

    def test_boolean_nulls(self):
        # pandas requires upcast to object dtype
        num_values = 100
        np.random.seed(0)

        mask = np.random.randint(0, 10, size=num_values) < 3
        values = np.random.randint(0, 10, size=num_values) < 5

        arr = pa.Array.from_pandas(values, mask)

        expected = values.astype(object)
        expected[mask] = None

        field = pa.field('bools', pa.bool_())
        schema = pa.schema([field])
        ex_frame = pd.DataFrame({'bools': expected})

        table = pa.Table.from_arrays([arr], ['bools'])
        assert table.schema.equals(schema)
        result = table.to_pandas()

        tm.assert_frame_equal(result, ex_frame)

    def test_boolean_object_nulls(self):
        arr = np.array([False, None, True] * 100, dtype=object)
        df = pd.DataFrame({'bools': arr})
        field = pa.field('bools', pa.bool_())
        schema = pa.schema([field])
        self._check_pandas_roundtrip(df, expected_schema=schema)

    def test_unicode(self):
        repeats = 1000
        values = [u'foo', None, u'bar', u'mañana', np.nan]
        df = pd.DataFrame({'strings': values * repeats})
        field = pa.field('strings', pa.string())
        schema = pa.schema([field])

        self._check_pandas_roundtrip(df, expected_schema=schema)

    def test_bytes_to_binary(self):
        # mixed str/bytes columns are inferred as binary; str values are
        # encoded to bytes on the way back
        values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
        df = pd.DataFrame({'strings': values})

        table = pa.Table.from_pandas(df)
        assert table[0].type == pa.binary()

        values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
        expected = pd.DataFrame({'strings': values2})
        self._check_pandas_roundtrip(df, expected)

    @pytest.mark.large_memory
    def test_bytes_exceed_2gb(self):
        # columns larger than 2GB must be chunked
        val = 'x' * (1 << 20)
        df = pd.DataFrame({
            'strings': np.array([val] * 4000, dtype=object)
        })
        arr = pa.Array.from_pandas(df['strings'])
        assert isinstance(arr, pa.ChunkedArray)
        assert arr.num_chunks == 2
        arr = None

        table = pa.Table.from_pandas(df)
        assert table[0].data.num_chunks == 2

    def test_fixed_size_bytes(self):
        values = [b'foo', None, b'bar', None, None, b'hey']
        df = pd.DataFrame({'strings': values})
        schema = pa.schema([pa.field('strings', pa.binary(3))])
        table = pa.Table.from_pandas(df, schema=schema)
        assert table.schema[0].type == schema[0].type
        assert table.schema[0].name == schema[0].name
        result = table.to_pandas()
        tm.assert_frame_equal(result, df)

    def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
        values = [b'foo', None, b'ba', None, None, b'hey']
        df = pd.DataFrame({'strings': values})
        schema = pa.schema([pa.field('strings', pa.binary(3))])
        with self.assertRaises(pa.ArrowInvalid):
            pa.Table.from_pandas(df, schema=schema)

    def test_timestamps_notimezone_no_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123',
                '2006-01-13T12:34:56.432',
                '2010-08-13T05:46:57.437'],
                dtype='datetime64[ms]')
        })
        field = pa.field('datetime64', pa.timestamp('ms'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=True,
            expected_schema=schema,
        )

        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                '2006-01-13T12:34:56.432539784',
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
        })
        field = pa.field('datetime64', pa.timestamp('ns'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=False,
            expected_schema=schema,
        )

    def test_timestamps_to_ms_explicit_schema(self):
        # ARROW-1328
        df = pd.DataFrame({'datetime': [datetime(2017, 1, 1)]})
        pa_type = pa.from_numpy_dtype(df['datetime'].dtype)

        arr = pa.Array.from_pandas(df['datetime'], type=pa_type,
                                   timestamps_to_ms=True)

        tm.assert_almost_equal(df['datetime'].values.astype('M8[ms]'),
                               arr.to_pandas())

    def test_timestamps_notimezone_nulls(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123',
                None,
                '2010-08-13T05:46:57.437'],
                dtype='datetime64[ms]')
        })
        field = pa.field('datetime64', pa.timestamp('ms'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=True,
            expected_schema=schema,
        )

        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                None,
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
        })
        field = pa.field('datetime64', pa.timestamp('ns'))
        schema = pa.schema([field])
        self._check_pandas_roundtrip(
            df,
            timestamps_to_ms=False,
            expected_schema=schema,
        )

    def test_timestamps_with_timezone(self):
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123',
                '2006-01-13T12:34:56.432',
                '2010-08-13T05:46:57.437'],
                dtype='datetime64[ms]')
        })
        df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern')
                            .to_frame())
        self._check_pandas_roundtrip(df, timestamps_to_ms=True)

        # drop-in a null and ns instead of ms
        df = pd.DataFrame({
            'datetime64': np.array([
                '2007-07-13T01:23:34.123456789',
                None,
                '2006-01-13T12:34:56.432539784',
                '2010-08-13T05:46:57.437699912'],
                dtype='datetime64[ns]')
        })
        df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern')
                            .to_frame())
        self._check_pandas_roundtrip(df, timestamps_to_ms=False)

    def test_date_infer(self):
        df = pd.DataFrame({
            'date': [date(2000, 1, 1),
                     None,
                     date(1970, 1, 1),
                     date(2040, 2, 26)]})
        table = pa.Table.from_pandas(df, preserve_index=False)
        field = pa.field('date', pa.date32())
        schema = pa.schema([field])
        assert table.schema.equals(schema)
        result = table.to_pandas()
        expected = df.copy()
        expected['date'] = pd.to_datetime(df['date'])
        tm.assert_frame_equal(result, expected)

    def test_date_objects_typed(self):
        arr = np.array([
            date(2017, 4, 3),
            None,
            date(2017, 4, 4),
            date(2017, 4, 5)], dtype=object)

        # equivalent day offsets (date32) and millisecond offsets (date64)
        arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
        arr_i8 = arr_i4.astype('int64') * 86400000
        mask = np.array([False, True, False, False])

        t32 = pa.date32()
        t64 = pa.date64()

        a32 = pa.Array.from_pandas(arr, type=t32)
        a64 = pa.Array.from_pandas(arr, type=t64)

        a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32)
        a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64)

        assert a32.equals(a32_expected)
        assert a64.equals(a64_expected)

        # Test converting back to pandas
        colnames = ['date32', 'date64']
        table = pa.Table.from_arrays([a32, a64], colnames)
        table_pandas = table.to_pandas()

        ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04',
                               '2017-04-05'],
                              dtype='datetime64[D]')
                     .astype('datetime64[ns]'))
        # mark the null slot via NaT's integer sentinel value
        ex_values[1] = pd.NaT.value
        expected_pandas = pd.DataFrame({'date32': ex_values,
                                        'date64': ex_values},
                                       columns=colnames)
        tm.assert_frame_equal(table_pandas, expected_pandas)

    def test_dates_from_integers(self):
        t1 = pa.date32()
        t2 = pa.date64()

        arr = np.array([17259, 17260, 17261], dtype='int32')
        arr2 = arr.astype('int64') * 86400000

        a1 = pa.Array.from_pandas(arr, type=t1)
        a2 = pa.Array.from_pandas(arr2, type=t2)

        expected = date(2017, 4, 3)
        assert a1[0].as_py() == expected
        assert a2[0].as_py() == expected

    @pytest.mark.xfail(reason="not supported ATM",
                       raises=NotImplementedError)
    def test_timedelta(self):
        # TODO(jreback): Pandas only support ns resolution
        # Arrow supports ??? for resolution
        df = pd.DataFrame({
            'timedelta': np.arange(start=0, stop=3 * 86400000,
                                   step=86400000,
                                   dtype='timedelta64[ms]')
        })
        pa.Table.from_pandas(df)

    def test_column_of_arrays(self):
        df, schema = dataframe_with_arrays()
        self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
        table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
        assert table.schema.equals(schema)

        for column in df.columns:
            field = schema.field_by_name(column)
            self._check_array_roundtrip(df[column], type=field.type)

    def test_column_of_arrays_to_py(self):
        # Test regression in ARROW-1199 not caught in above test
        dtype = 'i1'
        arr = np.array([
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ])
        type_ = pa.list_(pa.int8())
        parr = pa.Array.from_pandas(arr, type=type_)

        assert parr[0].as_py() == list(range(10))
        assert parr[1].as_py() == list(range(5))
        assert parr[2].as_py() is None
        assert parr[3].as_py() == [0]

    def test_column_of_lists(self):
        df, schema = dataframe_with_lists()
        self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
        table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
        assert table.schema.equals(schema)

        for column in df.columns:
            field = schema.field_by_name(column)
            self._check_array_roundtrip(df[column], type=field.type)

    def test_nested_lists_all_none(self):
        data = np.array([[None, None], None], dtype=object)

        arr = pa.Array.from_pandas(data)
        expected = pa.array(list(data))
        assert arr.equals(expected)
        assert arr.type == pa.list_(pa.null())

        data2 = np.array([None, None, [None, None],
                          np.array([None, None], dtype=object)],
                         dtype=object)
        arr = pa.Array.from_pandas(data2)
        expected = pa.array([None, None, [None, None], [None, None]])
        assert arr.equals(expected)

    def test_threaded_conversion(self):
        df = _alltypes_example()
        self._check_pandas_roundtrip(df, nthreads=2,
                                     timestamps_to_ms=False)

    def test_category(self):
        repeats = 5
        v1 = ['foo', None, 'bar', 'qux', np.nan]
        v2 = [4, 5, 6, 7, 8]
        v3 = [b'foo', None, b'bar', b'qux', np.nan]
        df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats),
                           'cat_ints': pd.Categorical(v2 * repeats),
                           'cat_binary': pd.Categorical(v3 * repeats),
                           'cat_strings_ordered': pd.Categorical(
                               v1 * repeats, categories=['bar', 'qux', 'foo'],
                               ordered=True),
                           'ints': v2 * repeats,
                           'ints2': v2 * repeats,
                           'strings': v1 * repeats,
                           'strings2': v1 * repeats,
                           'strings3': v3 * repeats})
        self._check_pandas_roundtrip(df)

        arrays = [
            pd.Categorical(v1 * repeats),
            pd.Categorical(v2 * repeats),
            pd.Categorical(v3 * repeats)
        ]
        for values in arrays:
            self._check_array_roundtrip(values)

    def test_mixed_types_fails(self):
        data = pd.DataFrame({'a': ['a', 1, 2.0]})
        with self.assertRaises(pa.ArrowException):
            pa.Table.from_pandas(data)

    def test_strided_data_import(self):
        cases = []

        columns = ['a', 'b', 'c']
        N, K = 100, 3
        random_numbers = np.random.randn(N, K).copy() * 100

        numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
                          'f4', 'f8']

        for type_name in numeric_dtypes:
            cases.append(random_numbers.astype(type_name))

        # strings
        cases.append(np.array([tm.rands(10) for i in range(N * K)],
                              dtype=object)
                     .reshape(N, K).copy())

        # booleans
        boolean_objects = (np.array([True, False, True] * N, dtype=object)
                           .reshape(N, K).copy())

        # add some nulls, so dtype comes back as objects
        boolean_objects[5] = None
        cases.append(boolean_objects)

        cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
                               dtype='datetime64[ms]')
                     .reshape(N, K).copy())

        strided_mask = (random_numbers > 0).astype(bool)[:, 0]

        for case in cases:
            df = pd.DataFrame(case, columns=columns)
            col = df['a']

            self._check_pandas_roundtrip(df)
            self._check_array_roundtrip(col)
            self._check_array_roundtrip(col, mask=strided_mask)

    def test_decimal_32_from_pandas(self):
        expected = pd.DataFrame({
            'decimals': [
                decimal.Decimal('-1234.123'),
                decimal.Decimal('1234.439'),
            ]
        })
        converted = pa.Table.from_pandas(expected, preserve_index=False)
        field = pa.field('decimals', pa.decimal(7, 3))
        schema = pa.schema([field])
        assert converted.schema.equals(schema)

    def test_decimal_32_to_pandas(self):
        expected = pd.DataFrame({
            'decimals': [
                decimal.Decimal('-1234.123'),
                decimal.Decimal('1234.439'),
            ]
        })
        converted = pa.Table.from_pandas(expected)
        df = converted.to_pandas()
        tm.assert_frame_equal(df, expected)

    def test_decimal_64_from_pandas(self):
        expected = pd.DataFrame({
            'decimals': [
                decimal.Decimal('-129934.123331'),
                decimal.Decimal('129534.123731'),
            ]
        })
        converted = pa.Table.from_pandas(expected, preserve_index=False)
        field = pa.field('decimals', pa.decimal(12, 6))
        schema = pa.schema([field])
        assert converted.schema.equals(schema)

    def test_decimal_64_to_pandas(self):
        expected = pd.DataFrame({
            'decimals': [
                decimal.Decimal('-129934.123331'),
                decimal.Decimal('129534.123731'),
            ]
        })
        converted = pa.Table.from_pandas(expected)
        df = converted.to_pandas()
        tm.assert_frame_equal(df, expected)

    def test_decimal_128_from_pandas(self):
        expected = pd.DataFrame({
            'decimals': [
                decimal.Decimal('394092382910493.12341234678'),
                -decimal.Decimal('314292388910493.12343437128'),
            ]
        })
        converted = pa.Table.from_pandas(expected, preserve_index=False)
        field = pa.field('decimals', pa.decimal(26, 11))
        schema = pa.schema([field])
        assert converted.schema.equals(schema)

    def test_decimal_128_to_pandas(self):
        expected = pd.DataFrame({
            'decimals': [
                decimal.Decimal('394092382910493.12341234678'),
                -decimal.Decimal('314292388910493.12343437128'),
            ]
        })
        converted = pa.Table.from_pandas(expected)
        df = converted.to_pandas()
        tm.assert_frame_equal(df, expected)

    def test_pytime_from_pandas(self):
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356)]

        # microseconds
        t1 = pa.time64('us')

        aobjs = np.array(pytimes + [None], dtype=object)
        parr = pa.Array.from_pandas(aobjs)
        assert parr.type == t1
        assert parr[0].as_py() == pytimes[0]
        assert parr[1].as_py() == pytimes[1]
        assert parr[2] is pa.NA

        # DataFrame
        df = pd.DataFrame({'times': aobjs})
        batch = pa.RecordBatch.from_pandas(df)
        assert batch[0].equals(parr)

        # Test ndarray of int64 values
        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')

        a1 = pa.Array.from_pandas(arr, type=pa.time64('us'))
        assert a1[0].as_py() == pytimes[0]

        a2 = pa.Array.from_pandas(arr * 1000, type=pa.time64('ns'))
        assert a2[0].as_py() == pytimes[0]

        a3 = pa.Array.from_pandas((arr / 1000).astype('i4'),
                                  type=pa.time32('ms'))
        assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)

        a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'),
                                  type=pa.time32('s'))
        assert a4[0].as_py() == pytimes[0].replace(microsecond=0)

    def test_arrow_time_to_pandas(self):
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356),
                   time(0, 0, 0)]

        expected = np.array(pytimes[:2] + [None])
        expected_ms = np.array([x.replace(microsecond=1000)
                                for x in pytimes[:2]] +
                               [None])
        expected_s = np.array([x.replace(microsecond=0)
                               for x in pytimes[:2]] +
                              [None])

        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')
        # NOTE(review): duplicate of the assignment above — harmless but
        # redundant
        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')

        null_mask = np.array([False, False, True], dtype=bool)

        a1 = pa.Array.from_pandas(arr, mask=null_mask, type=pa.time64('us'))
        a2 = pa.Array.from_pandas(arr * 1000, mask=null_mask,
                                  type=pa.time64('ns'))

        a3 = pa.Array.from_pandas((arr / 1000).astype('i4'), mask=null_mask,
                                  type=pa.time32('ms'))
        a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'), mask=null_mask,
                                  type=pa.time32('s'))

        names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
        batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
        arr = a1.to_pandas()
        assert (arr == expected).all()

        arr = a2.to_pandas()
        assert (arr == expected).all()

        arr = a3.to_pandas()
        assert (arr == expected_ms).all()

        arr = a4.to_pandas()
        assert (arr == expected_s).all()

        df = batch.to_pandas()
        expected_df = pd.DataFrame({'time64[us]': expected,
                                    'time64[ns]': expected,
                                    'time32[ms]': expected_ms,
                                    'time32[s]': expected_s},
                                   columns=names)

        tm.assert_frame_equal(df, expected_df)

    def test_all_nones(self):
        def _check_series(s):
            converted = pa.Array.from_pandas(s)
            assert isinstance(converted, pa.NullArray)
            assert len(converted) == 3
            assert converted.null_count == 3
            assert converted[0] is pa.NA

        _check_series(pd.Series([None] * 3, dtype=object))
        _check_series(pd.Series([np.nan] * 3, dtype=object))
        _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object))

    def test_multiindex_duplicate_values(self):
        num_rows = 3
        numbers = list(range(num_rows))
        index = pd.MultiIndex.from_arrays(
            [['foo', 'foo', 'bar'], numbers],
            names=['foobar', 'some_numbers'],
        )

        df = pd.DataFrame({'numbers': numbers}, index=index)

        table = pa.Table.from_pandas(df)
        result_df = table.to_pandas()
        tm.assert_frame_equal(result_df, df)

    def test_partial_schema(self):
        # columns absent from the explicit schema get inferred types
        data = OrderedDict([
            ('a', [0, 1, 2, 3, 4]),
            ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)),
            ('c', [-10, -5, 0, 5, 10])
        ])
        df = pd.DataFrame(data)

        partial_schema = pa.schema([
            pa.field('a', pa.int64()),
            pa.field('b', pa.int32())
        ])

        expected_schema = pa.schema([
            pa.field('a', pa.int64()),
            pa.field('b', pa.int32()),
            pa.field('c', pa.int64())
        ])

        self._check_pandas_roundtrip(df, schema=partial_schema,
                                     expected_schema=expected_schema)

    def test_structarray(self):
        ints = pa.array([None, 2, 3], type=pa.int64())
        strs = pa.array([u'a', None, u'c'], type=pa.string())
        bools = pa.array([True, False, None], type=pa.bool_())
        arr = pa.StructArray.from_arrays(
            ['ints', 'strs', 'bools'],
            [ints, strs, bools])

        expected = pd.Series([
            {'ints': None, 'strs': u'a', 'bools': True},
            {'ints': 2, 'strs': None, 'bools': False},
            {'ints': 3, 'strs': u'c', 'bools': None},
        ])

        series = pd.Series(arr.to_pandas())
        tm.assert_series_equal(series, expected)

    def test_infer_lists(self):
        data = OrderedDict([
            ('nan_ints', [[None, 1], [2, 3]]),
            ('ints', [[0, 1], [2, 3]]),
            ('strs', [[None, u'b'], [u'c', u'd']]),
            ('nested_strs', [[[None, u'b'], [u'c', u'd']], None])
        ])
        df = pd.DataFrame(data)

        expected_schema = pa.schema([
            pa.field('nan_ints', pa.list_(pa.int64())),
            pa.field('ints', pa.list_(pa.int64())),
            pa.field('strs', pa.list_(pa.string())),
            pa.field('nested_strs', pa.list_(pa.list_(pa.string())))
        ])

        self._check_pandas_roundtrip(df, expected_schema=expected_schema)

    def test_infer_numpy_array(self):
        data = OrderedDict([
            ('ints', [
                np.array([0, 1], dtype=np.int64),
                np.array([2, 3], dtype=np.int64)
            ])
        ])
        df = pd.DataFrame(data)
        expected_schema = pa.schema([
            pa.field('ints', pa.list_(pa.int64()))
        ])

        self._check_pandas_roundtrip(df, expected_schema=expected_schema)

    def test_metadata_with_mixed_types(self):
        df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
        table = pa.Table.from_pandas(df)
        metadata = table.schema.metadata
        assert b'mixed' not in metadata[b'pandas']

        js = json.loads(metadata[b'pandas'].decode('utf8'))
        data_column = js['columns'][0]
        assert data_column['pandas_type'] == 'bytes'
        assert data_column['numpy_type'] == 'object'

    def test_list_metadata(self):
        df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
        schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
        table = pa.Table.from_pandas(df, schema=schema)
        metadata = table.schema.metadata
        assert b'mixed' not in metadata[b'pandas']

        js = json.loads(metadata[b'pandas'].decode('utf8'))
        data_column = js['columns'][0]
        assert data_column['pandas_type'] == 'list[int64]'
        assert data_column['numpy_type'] == 'object'

    def test_decimal_metadata(self):
        expected = pd.DataFrame({
            'decimals': [
                decimal.Decimal('394092382910493.12341234678'),
                -decimal.Decimal('314292388910493.12343437128'),
            ]
        })
        table = pa.Table.from_pandas(expected)
        metadata = table.schema.metadata
        assert b'mixed' not in metadata[b'pandas']

        js = json.loads(metadata[b'pandas'].decode('utf8'))
        data_column = js['columns'][0]
        assert data_column['pandas_type'] == 'decimal'
        assert data_column['numpy_type'] == 'object'
        assert data_column['metadata'] == {'precision': 26, 'scale': 11}

    def test_table_str_to_categorical(self):
        # the ARROW-439 option: decode string columns as pandas Categorical
        values = [None, 'a', 'b', np.nan]
        df = pd.DataFrame({'strings': values})
        field = pa.field('strings', pa.string())
        schema = pa.schema([field])
        table = pa.Table.from_pandas(df, schema=schema)

        result = table.to_pandas(strings_to_categorical=True)
        expected = pd.DataFrame({'strings': pd.Categorical(values)})
        tm.assert_frame_equal(result, expected, check_dtype=True)
934
935 def _pytime_from_micros(val):
936 microseconds = val % 1000000
937 val //= 1000000
938 seconds = val % 60
939 val //= 60
940 minutes = val % 60
941 hours = val // 60
942 return time(hours, minutes, seconds, microseconds)
943
944
945 def _pytime_to_micros(pytime):
946 return (pytime.hour * 3600000000 +
947 pytime.minute * 60000000 +
948 pytime.second * 1000000 +
949 pytime.microsecond)