################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

"""
|
|
Entry point classes of Flink DataStream API:
|
|
|
|
- :class:`StreamExecutionEnvironment`:
|
|
The context in which a streaming program is executed.
|
|
- :class:`DataStream`:
|
|
Represents a stream of elements of the same type. A DataStream can be transformed
|
|
into another DataStream by applying a transformation.
|
|
- :class:`KeyedStream`:
|
|
Represents a :class:`DataStream` where elements are partitioned by key using a
|
|
provided KeySelector.
|
|
- :class:`WindowedStream`:
|
|
Represents a data stream where elements are grouped by key, and for each
|
|
key, the stream of elements is split into windows based on a WindowAssigner. Window emission
|
|
is triggered based on a Trigger.
|
|
- :class:`ConnectedStreams`:
|
|
Represent two connected streams of (possibly) different data types. Connected
|
|
streams are useful for cases where operations on one stream directly affect the operations on
|
|
the other stream, usually via shared state between the streams.
|
|
- :class:`BroadcastStream`:
|
|
Represent a stream with :class:`state.BroadcastState` (s).
|
|
- :class:`BroadcastConnectedStream`:
|
|
Represents the result of connecting a keyed or non-keyed stream, with a
|
|
:class:`BroadcastStream` with :class:`state.BroadcastState` (s)
|
|
|
|
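
A minimal sketch of how these entry points fit together (illustrative only; it assumes a local
PyFlink installation)::

    from pyflink.datastream import StreamExecutionEnvironment

    # Create the execution context, build a small pipeline and run it.
    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection([1, 2, 3, 4, 5])
    ds.map(lambda i: i * i).print()
    env.execute("squares")
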
Functions used to transform a :class:`DataStream` into another :class:`DataStream`
(see the sketch after this list):

    - :class:`MapFunction`:
      Performs an element-wise map transformation on a :class:`DataStream`.
    - :class:`CoMapFunction`:
      Performs a map transformation over two connected streams.
    - :class:`FlatMapFunction`:
      Performs a flatmap transformation of a :class:`DataStream` which produces zero, one, or more
      elements for each input element.
    - :class:`CoFlatMapFunction`:
      Performs a flatmap transformation over two connected streams.
    - :class:`FilterFunction`:
      A filter function is a predicate applied individually to each record.
    - :class:`ReduceFunction`:
      Combines groups of elements to a single value.
    - :class:`ProcessFunction`:
      Similar to :class:`FlatMapFunction`, except that it can access the current timestamp and
      watermark.
    - :class:`KeyedProcessFunction`:
      Similar to :class:`ProcessFunction`, except that it is applied to a :class:`KeyedStream` and
      can register event-time and processing-time timers.
    - :class:`CoProcessFunction`:
      Similar to :class:`CoFlatMapFunction`, except that it can access the current timestamp and
      watermark.
    - :class:`KeyedCoProcessFunction`:
      Similar to :class:`CoProcessFunction`, except that it is applied to a keyed
      :class:`ConnectedStreams` and can register event-time and processing-time timers.
    - :class:`WindowFunction`:
      Base interface for functions that are evaluated over keyed (grouped) windows.
    - :class:`ProcessWindowFunction`:
      Similar to :class:`WindowFunction`, except that it can access a context for retrieving extra
      information such as the current timestamp, the watermark, etc.
    - :class:`AggregateFunction`:
      Base class for a user-defined aggregate function.
    - :class:`BroadcastProcessFunction`:
      A function to be applied to a :class:`BroadcastConnectedStream` that connects
      :class:`BroadcastStream`, i.e. a stream with broadcast state, with a non-keyed
      :class:`DataStream`.
    - :class:`KeyedBroadcastProcessFunction`:
      A function to be applied to a :class:`BroadcastConnectedStream` that connects
      :class:`BroadcastStream`, i.e. a stream with broadcast state, with a :class:`KeyedStream`.
    - :class:`RuntimeContext`:
      Contains information about the context in which functions are executed. Each
      parallel instance of the function will have a context through which it can access static
      contextual information (such as the current parallelism), etc.
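
For illustration, a user-defined function can be written as a class and applied to a stream.
A minimal sketch (the `Doubler` class is illustrative only)::

    from pyflink.common import Types
    from pyflink.datastream import MapFunction, StreamExecutionEnvironment

    class Doubler(MapFunction):
        # Doubles every element of the stream.
        def map(self, value):
            return value * 2

    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection([1, 2, 3], type_info=Types.INT())
    ds.map(Doubler(), output_type=Types.INT()).print()
    env.execute("doubler")
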
Classes to define windows (an illustrative sketch follows this list):

    - :class:`Window`:
      A grouping of elements into finite buckets.
    - :class:`TimeWindow`:
      A grouping of elements according to a time interval from start (inclusive) to end
      (exclusive).
    - :class:`CountWindow`:
      A grouping of elements according to element count from start (inclusive) to end (exclusive).
    - :class:`GlobalWindow`:
      The window into which all data is placed.
    - :class:`WindowAssigner`:
      Assigns zero or more :class:`Window` to an element.
    - :class:`MergingWindowAssigner`:
      A :class:`WindowAssigner` that can merge windows.
    - :class:`TriggerResult`:
      Result type for trigger methods. This determines what happens with the window, for example
      whether the window function should be called, or the window should be discarded.
    - :class:`Trigger`:
      Determines when a pane of a window should be evaluated to emit the results for that
      part of the window.
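
A minimal sketch of keyed windowing (this assumes the `count_window` shortcut on
:class:`KeyedStream`; time-based windows are assigned with a :class:`WindowAssigner` instead,
and the sample elements are placeholders)::

    from pyflink.datastream import StreamExecutionEnvironment

    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection([('a', 1), ('a', 2), ('b', 3), ('a', 4)])
    (ds.key_by(lambda t: t[0])                      # partition by the first field of each tuple
       .count_window(3)                             # per key, group every 3 elements into a window
       .reduce(lambda a, b: (a[0], a[1] + b[1]))    # sum the second field within each window
       .print())
    env.execute("count windows")
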
Classes to define the behavior of checkpointing and state backends (a configuration sketch
follows this list):

    - :class:`CheckpointingMode`:
      Defines what consistency guarantees the system gives in the presence of failures.
    - :class:`CheckpointConfig`:
      Configuration that captures all checkpointing related settings.
    - :class:`StateBackend`:
      Base class of the state backends which define how the state of a streaming application is
      stored locally within the cluster. Different state backends store their state in different
      fashions, and use different data structures to hold the state of a running application.
    - :class:`HashMapStateBackend`:
      Holds the working state in the memory (JVM heap) of the TaskManagers and
      checkpoints based on the configured :class:`CheckpointStorage`.
    - :class:`EmbeddedRocksDBStateBackend`:
      Stores its state in an embedded `RocksDB` instance. This state backend can store very large
      state that exceeds memory and spills to local disk.
    - :class:`CustomStateBackend`:
      A wrapper for a customized Java state backend.
    - :class:`JobManagerCheckpointStorage`:
      Checkpoints state directly to the JobManager's memory (hence the name), but savepoints will
      be persisted to a file system.
    - :class:`FileSystemCheckpointStorage`:
      Checkpoints state as files to a file system. Each checkpoint individually will store all its
      files in a subdirectory that includes the checkpoint number, such as
      `hdfs://namenode:port/flink-checkpoints/chk-17/`.
    - :class:`CustomCheckpointStorage`:
      A wrapper for a customized Java checkpoint storage.
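
A sketch of enabling checkpointing with a heap state backend and file-system checkpoint storage
(the interval and the checkpoint path are placeholders)::

    from pyflink.datastream import (CheckpointingMode, FileSystemCheckpointStorage,
                                    HashMapStateBackend, StreamExecutionEnvironment)

    env = StreamExecutionEnvironment.get_execution_environment()
    env.enable_checkpointing(10000)  # take a checkpoint every 10 seconds
    env.get_checkpoint_config().set_checkpointing_mode(CheckpointingMode.EXACTLY_ONCE)
    env.set_state_backend(HashMapStateBackend())
    env.get_checkpoint_config().set_checkpoint_storage(
        FileSystemCheckpointStorage("file:///tmp/flink-checkpoints"))
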
Classes for state operations (a keyed-state sketch follows this list):

    - :class:`state.ValueState`:
      Interface for partitioned single-value state. The value can be retrieved or updated.
    - :class:`state.ListState`:
      Interface for partitioned list state in Operations. The state is accessed and modified by
      user functions, and checkpointed consistently by the system as part of the distributed
      snapshots.
    - :class:`state.MapState`:
      Interface for partitioned key-value state. The key-value pair can be added, updated and
      retrieved.
    - :class:`state.ReducingState`:
      Interface for reducing state. Elements can be added to the state; they will be combined
      using a :class:`ReduceFunction`. The current state can be inspected.
    - :class:`state.AggregatingState`:
      Interface for aggregating state, based on an :class:`AggregateFunction`. Elements that are
      added to this type of state will be eagerly pre-aggregated using a given AggregateFunction.
    - :class:`state.BroadcastState`:
      A type of state that can be created to store the state of a :class:`BroadcastStream`. This
      state assumes that the same elements are sent to all instances of an operator.
    - :class:`state.ReadOnlyBroadcastState`:
      A read-only view of the :class:`state.BroadcastState`.
    - :class:`state.StateTtlConfig`:
      Configuration of state TTL logic.
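
For example, keyed state can be used inside a :class:`KeyedProcessFunction`. A minimal
de-duplication sketch using :class:`state.ValueState` (the `FirstSeenOnly` class is
illustrative only)::

    from pyflink.common import Types
    from pyflink.datastream import KeyedProcessFunction, RuntimeContext
    from pyflink.datastream.state import ValueStateDescriptor

    class FirstSeenOnly(KeyedProcessFunction):
        # Emits only the first element observed for each key.
        # Apply it to a keyed stream, e.g. ds.key_by(...).process(FirstSeenOnly()).
        def open(self, runtime_context: RuntimeContext):
            self.seen = runtime_context.get_state(
                ValueStateDescriptor("seen", Types.BOOLEAN()))

        def process_element(self, value, ctx):
            if self.seen.value() is None:
                self.seen.update(True)
                yield value
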
Classes to define source & sink (a Kafka source sketch follows this list):

    - :class:`connectors.elasticsearch.ElasticsearchSink`:
      A sink for publishing data into Elasticsearch 6 or Elasticsearch 7.
    - :class:`connectors.kafka.FlinkKafkaConsumer`:
      A streaming data source that pulls a parallel data stream from Apache Kafka.
    - :class:`connectors.kafka.FlinkKafkaProducer`:
      A streaming data sink to produce data into a Kafka topic.
    - :class:`connectors.kafka.KafkaSource`:
      The new API to read data in parallel from Apache Kafka.
    - :class:`connectors.kafka.KafkaSink`:
      The new API to write data into Apache Kafka topics.
    - :class:`connectors.file_system.FileSource`:
      A unified data source that reads files - both in batch and in streaming mode.
      This source supports all (distributed) file systems and object stores that can be accessed
      via Flink's FileSystem class.
    - :class:`connectors.file_system.FileSink`:
      A unified sink that emits its input elements to FileSystem files within buckets. This
      sink achieves exactly-once semantics for both BATCH and STREAMING.
    - :class:`connectors.file_system.StreamingFileSink`:
      Sink that emits its input elements to files within buckets. This is integrated with the
      checkpointing mechanism to provide exactly-once semantics.
    - :class:`connectors.number_seq.NumberSequenceSource`:
      A data source that produces a sequence of numbers (longs). This source is useful for testing
      and for cases that just need a stream of N events of any kind.
    - :class:`connectors.jdbc.JdbcSink`:
      A data sink to produce data into an external storage using JDBC.
    - :class:`connectors.pulsar.PulsarSource`:
      A streaming data source that pulls a parallel data stream from Pulsar.
    - :class:`connectors.pulsar.PulsarSink`:
      A streaming data sink to produce data into Pulsar.
    - :class:`connectors.rabbitmq.RMQSource`:
      A streaming data source that pulls a parallel data stream from RabbitMQ.
    - :class:`connectors.rabbitmq.RMQSink`:
      A sink for publishing data into RabbitMQ.
    - :class:`connectors.cassandra.CassandraSink`:
      A sink for publishing data into Cassandra.
    - :class:`connectors.kinesis.FlinkKinesisConsumer`:
      A streaming data source that pulls a parallel data stream from Kinesis.
    - :class:`connectors.kinesis.KinesisStreamsSink`:
      A Kinesis Data Streams (KDS) sink that performs async requests against a destination stream
      using the buffering protocol.
    - :class:`connectors.kinesis.KinesisFirehoseSink`:
      A Kinesis Data Firehose (KDF) sink that performs async requests against a destination
      delivery stream using the buffering protocol.
    - :class:`connectors.hybrid_source.HybridSource`:
      A hybrid source that switches its underlying sources based on the configured source chain.
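
A sketch of wiring a :class:`connectors.kafka.KafkaSource` into a job (the broker address,
topic and group id are placeholders, and the Kafka connector jar must be on the classpath)::

    from pyflink.common.serialization import SimpleStringSchema
    from pyflink.common.watermark_strategy import WatermarkStrategy
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.datastream.connectors.kafka import KafkaOffsetsInitializer, KafkaSource

    env = StreamExecutionEnvironment.get_execution_environment()
    source = (KafkaSource.builder()
              .set_bootstrap_servers("localhost:9092")
              .set_topics("input-topic")
              .set_group_id("my-group")
              .set_starting_offsets(KafkaOffsetsInitializer.earliest())
              .set_value_only_deserializer(SimpleStringSchema())
              .build())
    ds = env.from_source(source, WatermarkStrategy.no_watermarks(), "kafka source")
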
Classes to define formats used together with source & sink:

    - :class:`formats.csv.CsvReaderFormat`:
      A :class:`~connectors.file_system.StreamFormat` to read CSV files into Row data.
    - :class:`formats.csv.CsvBulkWriter`:
      Creates a :class:`~pyflink.common.serialization.BulkWriterFactory` to write Row data into
      CSV files.
    - :class:`formats.avro.GenericRecordAvroTypeInfo`:
      A :class:`~pyflink.common.typeinfo.TypeInformation` to indicate that vanilla Python records
      will be translated to GenericRecordAvroTypeInfo on the Java side.
    - :class:`formats.avro.AvroInputFormat`:
      An InputFormat to read Avro files in a streaming fashion.
    - :class:`formats.avro.AvroWriters`:
      A class to provide :class:`~pyflink.common.serialization.BulkWriterFactory` to write vanilla
      Python objects into Avro files in a batch fashion.
    - :class:`formats.parquet.ParquetColumnarRowInputFormat`:
      A :class:`~connectors.file_system.BulkFormat` to read columnar Parquet files into Row data
      in a batch-processing fashion.
    - :class:`formats.parquet.ParquetBulkWriters`:
      Convenient builder to create a :class:`~pyflink.common.serialization.BulkWriterFactory` that
      writes Rows with a defined RowType into Parquet files in a batch fashion.
    - :class:`formats.parquet.AvroParquetReaders`:
      A convenience builder to create a reader format that reads individual Avro records from a
      Parquet stream. Only GenericRecord is supported in PyFlink.
    - :class:`formats.parquet.AvroParquetWriters`:
      Convenience builder to create ParquetWriterFactory instances for Avro types. Only
      GenericRecord is supported in PyFlink.
    - :class:`formats.orc.OrcBulkWriters`:
      Convenient builder to create a :class:`~pyflink.common.serialization.BulkWriterFactory` that
      writes Row records with a defined :class:`RowType` into Orc files.
Other important classes (a side-output sketch follows this list):

    - :class:`TimeDomain`:
      Specifies whether a firing timer is based on event time or processing time.
    - :class:`KeySelector`:
      The extractor takes an object and returns the deterministic key for that object.
    - :class:`Partitioner`:
      Function to implement a custom partition assignment for keys.
    - :class:`SinkFunction`:
      Interface for implementing user-defined sink functionality.
    - :class:`SourceFunction`:
      Interface for implementing user-defined source functionality.
    - :class:`OutputTag`:
      Tag with a name and type for identifying the side output of an operator.
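
A minimal sketch of emitting and consuming a side output via :class:`OutputTag` (the tag name,
the `SplitBySize` class and the sample elements are illustrative only)::

    from pyflink.common import Types
    from pyflink.datastream import OutputTag, ProcessFunction, StreamExecutionEnvironment

    large_tag = OutputTag("large", Types.INT())

    class SplitBySize(ProcessFunction):
        # Routes values greater than 100 to the side output; the rest stay on the main output.
        def process_element(self, value, ctx):
            if value > 100:
                yield large_tag, value
            else:
                yield value

    env = StreamExecutionEnvironment.get_execution_environment()
    main = env.from_collection([1, 150, 7]).process(SplitBySize(), output_type=Types.INT())
    large = main.get_side_output(large_tag)
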
"""
from pyflink.datastream.checkpoint_config import CheckpointConfig
from pyflink.datastream.externalized_checkpoint_retention import ExternalizedCheckpointRetention
from pyflink.datastream.checkpointing_mode import CheckpointingMode
from pyflink.datastream.data_stream import DataStream, KeyedStream, WindowedStream, \
    ConnectedStreams, DataStreamSink, BroadcastStream, BroadcastConnectedStream
from pyflink.datastream.execution_mode import RuntimeExecutionMode
from pyflink.datastream.functions import (MapFunction, CoMapFunction, FlatMapFunction,
                                          CoFlatMapFunction, ReduceFunction, RuntimeContext,
                                          KeySelector, FilterFunction, Partitioner, SourceFunction,
                                          SinkFunction, CoProcessFunction, KeyedProcessFunction,
                                          KeyedCoProcessFunction, AggregateFunction, WindowFunction,
                                          ProcessWindowFunction, BroadcastProcessFunction,
                                          KeyedBroadcastProcessFunction)
from pyflink.datastream.slot_sharing_group import SlotSharingGroup, MemorySize
from pyflink.datastream.state_backend import (StateBackend, CustomStateBackend,
                                              PredefinedOptions, HashMapStateBackend,
                                              EmbeddedRocksDBStateBackend)
from pyflink.datastream.checkpoint_storage import (CheckpointStorage, JobManagerCheckpointStorage,
                                                   FileSystemCheckpointStorage,
                                                   CustomCheckpointStorage)
from pyflink.datastream.stream_execution_environment import StreamExecutionEnvironment
from pyflink.datastream.time_domain import TimeDomain
from pyflink.datastream.functions import ProcessFunction
from pyflink.datastream.timerservice import TimerService
from pyflink.datastream.window import Window, TimeWindow, CountWindow, WindowAssigner, \
    MergingWindowAssigner, TriggerResult, Trigger, GlobalWindow
from pyflink.datastream.output_tag import OutputTag

__all__ = [
    'StreamExecutionEnvironment',
    'DataStream',
    'KeyedStream',
    'WindowedStream',
    'ConnectedStreams',
    'BroadcastStream',
    'BroadcastConnectedStream',
    'DataStreamSink',
    'MapFunction',
    'CoMapFunction',
    'FlatMapFunction',
    'CoFlatMapFunction',
    'ReduceFunction',
    'FilterFunction',
    'ProcessFunction',
    'KeyedProcessFunction',
    'CoProcessFunction',
    'KeyedCoProcessFunction',
    'WindowFunction',
    'ProcessWindowFunction',
    'AggregateFunction',
    'BroadcastProcessFunction',
    'KeyedBroadcastProcessFunction',
    'RuntimeContext',
    'TimerService',
    'CheckpointingMode',
    'CheckpointConfig',
    'ExternalizedCheckpointRetention',
    'StateBackend',
    'HashMapStateBackend',
    'EmbeddedRocksDBStateBackend',
    'CustomStateBackend',
    'PredefinedOptions',
    'CheckpointStorage',
    'JobManagerCheckpointStorage',
    'FileSystemCheckpointStorage',
    'CustomCheckpointStorage',
    'RuntimeExecutionMode',
    'Window',
    'TimeWindow',
    'CountWindow',
    'GlobalWindow',
    'WindowAssigner',
    'MergingWindowAssigner',
    'TriggerResult',
    'Trigger',
    'TimeDomain',
    'KeySelector',
    'Partitioner',
    'SourceFunction',
    'SinkFunction',
    'SlotSharingGroup',
    'MemorySize',
    'OutputTag'
]