You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
flink/flink-python/pyflink/table/serializers.py

80 lines
3.0 KiB
Python

################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import io
from pyflink.serializers import IterableSerializer
from pyflink.table.utils import arrow_to_pandas, pandas_to_arrow
class ArrowSerializer(IterableSerializer):
"""
Serializes pandas.Series into Arrow streaming format data.
"""
def __init__(self, schema, row_type, timezone):
super(ArrowSerializer, self).__init__()
self._schema = schema
self._field_types = row_type.field_types()
self._timezone = timezone
def __repr__(self):
return "ArrowSerializer"
def serialize(self, iterable, stream):
writer = None
try:
for cols in iterable:
batch = pandas_to_arrow(self._schema, self._timezone, self._field_types, cols)
if writer is None:
import pyarrow as pa
writer = pa.RecordBatchStreamWriter(stream, batch.schema)
writer.write_batch(batch)
finally:
if writer is not None:
writer.close()
def deserialize(self, stream):
import pyarrow as pa
reader = pa.ipc.open_stream(stream)
for batch in reader:
yield arrow_to_pandas(self._timezone, self._field_types, [batch])
def load_from_iterator(self, iter):
class IteratorIO(io.RawIOBase):
def __init__(self, iter):
super(IteratorIO, self).__init__()
self.iter = iter
self.leftover = None
def readable(self):
return True
def readinto(self, b):
output_buffer_len = len(b)
input = self.leftover or (self.iter.next() if self.iter.hasNext() else None)
if input is None:
return 0
output, self.leftover = input[:output_buffer_len], input[output_buffer_len:]
b[:len(output)] = output
return len(output)
import pyarrow as pa
reader = pa.ipc.open_stream(
io.BufferedReader(IteratorIO(iter), buffer_size=io.DEFAULT_BUFFER_SIZE))
for batch in reader:
yield batch