#!/usr/bin/env python # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import contextlib import glob import os import os.path from os.path import join as pjoin import re import shlex import shutil import sys if sys.version_info >= (3, 10): import sysconfig else: # Get correct EXT_SUFFIX on Windows (https://bugs.python.org/issue39825) from distutils import sysconfig import pkg_resources from setuptools import setup, Extension, Distribution from Cython.Distutils import build_ext as _build_ext import Cython # Check if we're running 64-bit Python is_64_bit = sys.maxsize > 2**32 if Cython.__version__ < '0.29': raise Exception('Please upgrade to Cython 0.29 or newer') setup_dir = os.path.abspath(os.path.dirname(__file__)) ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') @contextlib.contextmanager def changed_dir(dirname): oldcwd = os.getcwd() os.chdir(dirname) try: yield finally: os.chdir(oldcwd) def strtobool(val): """Convert a string representation of truth to true (1) or false (0). True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if 'val' is anything else. """ # Copied from distutils val = val.lower() if val in ('y', 'yes', 't', 'true', 'on', '1'): return 1 elif val in ('n', 'no', 'f', 'false', 'off', '0'): return 0 else: raise ValueError("invalid truth value %r" % (val,)) class build_ext(_build_ext): _found_names = () def build_extensions(self): numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') self.extensions = [ext for ext in self.extensions if ext.name != '__dummy__'] for ext in self.extensions: if (hasattr(ext, 'include_dirs') and numpy_incl not in ext.include_dirs): ext.include_dirs.append(numpy_incl) _build_ext.build_extensions(self) def run(self): self._run_cmake() _build_ext.run(self) # adapted from cmake_build_ext in dynd-python # github.com/libdynd/dynd-python description = "Build the C-extensions for arrow" user_options = ([('cmake-generator=', None, 'CMake generator'), ('extra-cmake-args=', None, 'extra arguments for CMake'), ('build-type=', None, 'build type (debug or release), default release'), ('boost-namespace=', None, 'namespace of boost (default: boost)'), ('with-cuda', None, 'build the Cuda extension'), ('with-flight', None, 'build the Flight extension'), ('with-dataset', None, 'build the Dataset extension'), ('with-parquet', None, 'build the Parquet extension'), ('with-s3', None, 'build the Amazon S3 extension'), ('with-static-parquet', None, 'link parquet statically'), ('with-static-boost', None, 'link boost statically'), ('with-plasma', None, 'build the Plasma extension'), ('with-tensorflow', None, 'build pyarrow with TensorFlow support'), ('with-orc', None, 'build the ORC extension'), ('with-gandiva', None, 'build the Gandiva extension'), ('generate-coverage', None, 'enable Cython code coverage'), ('bundle-boost', None, 'bundle the (shared) Boost libraries'), ('bundle-cython-cpp', None, 'bundle generated Cython C++ code ' '(used for code coverage)'), ('bundle-arrow-cpp', None, 'bundle the Arrow C++ libraries'), ('bundle-arrow-cpp-headers', None, 'bundle the Arrow C++ headers'), ('bundle-plasma-executable', None, 'bundle the plasma-store-server executable')] + _build_ext.user_options) def initialize_options(self): _build_ext.initialize_options(self) self.cmake_generator = os.environ.get('PYARROW_CMAKE_GENERATOR') if not self.cmake_generator and sys.platform == 'win32': self.cmake_generator = 'Visual Studio 15 2017 Win64' self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '') self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'release').lower() self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE', 'boost') self.cmake_cxxflags = os.environ.get('PYARROW_CXXFLAGS', '') if sys.platform == 'win32': # Cannot do debug builds in Windows unless Python itself is a debug # build if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' self.with_s3 = strtobool( os.environ.get('PYARROW_WITH_S3', '0')) self.with_hdfs = strtobool( os.environ.get('PYARROW_WITH_HDFS', '0')) self.with_cuda = strtobool( os.environ.get('PYARROW_WITH_CUDA', '0')) self.with_flight = strtobool( os.environ.get('PYARROW_WITH_FLIGHT', '0')) self.with_dataset = strtobool( os.environ.get('PYARROW_WITH_DATASET', '0')) self.with_parquet = strtobool( os.environ.get('PYARROW_WITH_PARQUET', '0')) self.with_static_parquet = strtobool( os.environ.get('PYARROW_WITH_STATIC_PARQUET', '0')) self.with_static_boost = strtobool( os.environ.get('PYARROW_WITH_STATIC_BOOST', '0')) self.with_plasma = strtobool( os.environ.get('PYARROW_WITH_PLASMA', '0')) self.with_tensorflow = strtobool( os.environ.get('PYARROW_WITH_TENSORFLOW', '0')) self.with_orc = strtobool( os.environ.get('PYARROW_WITH_ORC', '0')) self.with_gandiva = strtobool( os.environ.get('PYARROW_WITH_GANDIVA', '0')) self.generate_coverage = strtobool( os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) self.bundle_arrow_cpp = strtobool( os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) self.bundle_cython_cpp = strtobool( os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) self.bundle_boost = strtobool( os.environ.get('PYARROW_BUNDLE_BOOST', '0')) self.bundle_arrow_cpp_headers = strtobool( os.environ.get('PYARROW_BUNDLE_ARROW_CPP_HEADERS', '1')) self.bundle_plasma_executable = strtobool( os.environ.get('PYARROW_BUNDLE_PLASMA_EXECUTABLE', '1')) CYTHON_MODULE_NAMES = [ 'lib', '_fs', '_csv', '_json', '_compute', '_cuda', '_flight', '_dataset', '_dataset_orc', '_feather', '_parquet', '_orc', '_plasma', '_s3fs', '_hdfs', '_hdfsio', 'gandiva'] def _run_cmake(self): # check if build_type is correctly passed / set if self.build_type.lower() not in ('release', 'debug'): raise ValueError("--build-type (or PYARROW_BUILD_TYPE) needs to " "be 'release' or 'debug'") # The directory containing this setup.py source = os.path.dirname(os.path.abspath(__file__)) # The staging directory for the module being built build_cmd = self.get_finalized_command('build') build_temp = pjoin(os.getcwd(), build_cmd.build_temp) build_lib = pjoin(os.getcwd(), build_cmd.build_lib) saved_cwd = os.getcwd() if not os.path.isdir(build_temp): self.mkpath(build_temp) # Change to the build directory with changed_dir(build_temp): # Detect if we built elsewhere if os.path.isfile('CMakeCache.txt'): cachefile = open('CMakeCache.txt', 'r') cachedir = re.search('CMAKE_CACHEFILE_DIR:INTERNAL=(.*)', cachefile.read()).group(1) cachefile.close() if (cachedir != build_temp): return static_lib_option = '' cmake_options = [ '-DPYTHON_EXECUTABLE=%s' % sys.executable, '-DPython3_EXECUTABLE=%s' % sys.executable, static_lib_option, ] def append_cmake_bool(value, varname): cmake_options.append('-D{0}={1}'.format( varname, 'on' if value else 'off')) if self.cmake_generator: cmake_options += ['-G', self.cmake_generator] append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC') append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET') append_cmake_bool(self.with_plasma, 'PYARROW_BUILD_PLASMA') append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') append_cmake_bool(self.with_tensorflow, 'PYARROW_USE_TENSORFLOW') append_cmake_bool(self.bundle_arrow_cpp, 'PYARROW_BUNDLE_ARROW_CPP') append_cmake_bool(self.bundle_boost, 'PYARROW_BUNDLE_BOOST') append_cmake_bool(self.generate_coverage, 'PYARROW_GENERATE_COVERAGE') append_cmake_bool(not self.with_static_boost, 'PYARROW_BOOST_USE_SHARED') append_cmake_bool(not self.with_static_parquet, 'PYARROW_PARQUET_USE_SHARED') cmake_options.append('-DCMAKE_BUILD_TYPE={0}' .format(self.build_type.lower())) if self.boost_namespace != 'boost': cmake_options.append('-DBoost_NAMESPACE={}' .format(self.boost_namespace)) extra_cmake_args = shlex.split(self.extra_cmake_args) build_tool_args = [] if sys.platform == 'win32': if not is_64_bit: raise RuntimeError('Not supported on 32-bit Windows') else: build_tool_args.append('--') if os.environ.get('PYARROW_BUILD_VERBOSE', '0') == '1': cmake_options.append('-DCMAKE_VERBOSE_MAKEFILE=ON') if os.environ.get('PYARROW_PARALLEL'): build_tool_args.append( '-j{0}'.format(os.environ['PYARROW_PARALLEL'])) # Generate the build files print("-- Running cmake for pyarrow") self.spawn(['cmake'] + extra_cmake_args + cmake_options + [source]) print("-- Finished cmake for pyarrow") print("-- Running cmake --build for pyarrow") self.spawn(['cmake', '--build', '.', '--config', self.build_type] + build_tool_args) print("-- Finished cmake --build for pyarrow") if self.inplace: # a bit hacky build_lib = saved_cwd # Move the libraries to the place expected by the Python build try: os.makedirs(pjoin(build_lib, 'pyarrow')) except OSError: pass if sys.platform == 'win32': build_prefix = '' else: build_prefix = self.build_type if self.bundle_arrow_cpp or self.bundle_arrow_cpp_headers: print('Bundling includes: ' + pjoin(build_prefix, 'include')) if os.path.exists(pjoin(build_lib, 'pyarrow', 'include')): shutil.rmtree(pjoin(build_lib, 'pyarrow', 'include')) shutil.move(pjoin(build_prefix, 'include'), pjoin(build_lib, 'pyarrow')) # Move the built C-extension to the place expected by the Python # build self._found_names = [] for name in self.CYTHON_MODULE_NAMES: built_path = self.get_ext_built(name) if not os.path.exists(built_path): print('Did not find {0}'.format(built_path)) if self._failure_permitted(name): print('Cython module {0} failure permitted' .format(name)) continue raise RuntimeError('pyarrow C-extension failed to build:', os.path.abspath(built_path)) # The destination path to move the built C extension to ext_path = pjoin(build_lib, self._get_cmake_ext_path(name)) if os.path.exists(ext_path): os.remove(ext_path) self.mkpath(os.path.dirname(ext_path)) if self.bundle_cython_cpp: self._bundle_cython_cpp(name, build_lib) print('Moving built C-extension', built_path, 'to build path', ext_path) shutil.move(built_path, ext_path) self._found_names.append(name) if os.path.exists(self.get_ext_built_api_header(name)): shutil.move(self.get_ext_built_api_header(name), pjoin(os.path.dirname(ext_path), name + '_api.h')) if self.bundle_arrow_cpp: self._bundle_arrow_cpp(build_prefix, build_lib) if self.with_plasma and self.bundle_plasma_executable: # Move the plasma store source = os.path.join(self.build_type, "plasma-store-server") target = os.path.join(build_lib, self._get_build_dir(), "plasma-store-server") shutil.move(source, target) def _bundle_arrow_cpp(self, build_prefix, build_lib): print(pjoin(build_lib, 'pyarrow')) move_shared_libs(build_prefix, build_lib, "arrow") move_shared_libs(build_prefix, build_lib, "arrow_python") if self.with_cuda: move_shared_libs(build_prefix, build_lib, "arrow_cuda") if self.with_flight: move_shared_libs(build_prefix, build_lib, "arrow_flight") move_shared_libs(build_prefix, build_lib, "arrow_python_flight") if self.with_dataset: move_shared_libs(build_prefix, build_lib, "arrow_dataset") if self.with_plasma: move_shared_libs(build_prefix, build_lib, "plasma") if self.with_gandiva: move_shared_libs(build_prefix, build_lib, "gandiva") if self.with_parquet and not self.with_static_parquet: move_shared_libs(build_prefix, build_lib, "parquet") if not self.with_static_boost and self.bundle_boost: move_shared_libs( build_prefix, build_lib, "{}_regex".format(self.boost_namespace), implib_required=False) def _bundle_cython_cpp(self, name, lib_path): cpp_generated_path = self.get_ext_generated_cpp_source(name) if not os.path.exists(cpp_generated_path): raise RuntimeError('expected to find generated C++ file ' 'in {0!r}'.format(cpp_generated_path)) # The destination path to move the generated C++ source to # (for Cython source coverage) cpp_path = pjoin(lib_path, self._get_build_dir(), os.path.basename(cpp_generated_path)) if os.path.exists(cpp_path): os.remove(cpp_path) print('Moving generated C++ source', cpp_generated_path, 'to build path', cpp_path) shutil.move(cpp_generated_path, cpp_path) def _failure_permitted(self, name): if name == '_parquet' and not self.with_parquet: return True if name == '_plasma' and not self.with_plasma: return True if name == '_orc' and not self.with_orc: return True if name == '_flight' and not self.with_flight: return True if name == '_s3fs' and not self.with_s3: return True if name == '_hdfs' and not self.with_hdfs: return True if name == '_dataset' and not self.with_dataset: return True if name == '_dataset_orc' and not ( self.with_orc and self.with_dataset ): return True if name == '_cuda' and not self.with_cuda: return True if name == 'gandiva' and not self.with_gandiva: return True return False def _get_build_dir(self): # Get the package directory from build_py build_py = self.get_finalized_command('build_py') return build_py.get_package_dir('pyarrow') def _get_cmake_ext_path(self, name): # This is the name of the arrow C-extension filename = name + ext_suffix return pjoin(self._get_build_dir(), filename) def get_ext_generated_cpp_source(self, name): if sys.platform == 'win32': head, tail = os.path.split(name) return pjoin(head, tail + ".cpp") else: return pjoin(name + ".cpp") def get_ext_built_api_header(self, name): if sys.platform == 'win32': head, tail = os.path.split(name) return pjoin(head, tail + "_api.h") else: return pjoin(name + "_api.h") def get_ext_built(self, name): if sys.platform == 'win32': head, tail = os.path.split(name) # Visual Studio seems to differ from other generators in # where it places output files. if self.cmake_generator.startswith('Visual Studio'): return pjoin(head, self.build_type, tail + ext_suffix) else: return pjoin(head, tail + ext_suffix) else: return pjoin(self.build_type, name + ext_suffix) def get_names(self): return self._found_names def get_outputs(self): # Just the C extensions # regular_exts = _build_ext.get_outputs(self) return [self._get_cmake_ext_path(name) for name in self.get_names()] def move_shared_libs(build_prefix, build_lib, lib_name, implib_required=True): if sys.platform == 'win32': # Move all .dll and .lib files libs = [lib_name + '.dll'] if implib_required: libs.append(lib_name + '.lib') for filename in libs: shutil.move(pjoin(build_prefix, filename), pjoin(build_lib, 'pyarrow', filename)) else: _move_shared_libs_unix(build_prefix, build_lib, lib_name) def _move_shared_libs_unix(build_prefix, build_lib, lib_name): shared_library_prefix = 'lib' if sys.platform == 'darwin': shared_library_suffix = '.dylib' else: shared_library_suffix = '.so' lib_filename = (shared_library_prefix + lib_name + shared_library_suffix) # Also copy libraries with ABI/SO version suffix if sys.platform == 'darwin': lib_pattern = (shared_library_prefix + lib_name + ".*" + shared_library_suffix[1:]) libs = glob.glob(pjoin(build_prefix, lib_pattern)) else: libs = glob.glob(pjoin(build_prefix, lib_filename) + '*') if not libs: raise Exception('Could not find library:' + lib_filename + ' in ' + build_prefix) # Longest suffix library should be copied, all others ignored and can be # symlinked later after the library has been installed libs.sort(key=lambda s: -len(s)) print(libs, libs[0]) lib_filename = os.path.basename(libs[0]) shutil.move(pjoin(build_prefix, lib_filename), pjoin(build_lib, 'pyarrow', lib_filename)) # If the event of not running from a git clone (e.g. from a git archive # or a Python sdist), see if we can set the version number ourselves default_version = '6.0.1' if (not os.path.exists('../.git') and not os.environ.get('SETUPTOOLS_SCM_PRETEND_VERSION')): os.environ['SETUPTOOLS_SCM_PRETEND_VERSION'] = \ default_version.replace('-SNAPSHOT', 'a0') # See https://github.com/pypa/setuptools_scm#configuration-parameters scm_version_write_to_prefix = os.environ.get( 'SETUPTOOLS_SCM_VERSION_WRITE_TO_PREFIX', setup_dir) def parse_git(root, **kwargs): """ Parse function for setuptools_scm that ignores tags for non-C++ subprojects, e.g. apache-arrow-js-XXX tags. """ from setuptools_scm.git import parse kwargs['describe_command'] =\ 'git describe --dirty --tags --long --match "apache-arrow-[0-9].*"' return parse(root, **kwargs) def guess_next_dev_version(version): if version.exact: return version.format_with('{tag}') else: def guess_next_version(tag_version): return default_version.replace('-SNAPSHOT', '') return version.format_next_version(guess_next_version) with open('README.md') as f: long_description = f.read() class BinaryDistribution(Distribution): def has_ext_modules(foo): return True install_requires = ( 'numpy >= 1.16.6', ) # Only include pytest-runner in setup_requires if we're invoking tests if {'pytest', 'test', 'ptr'}.intersection(sys.argv): setup_requires = ['pytest-runner'] else: setup_requires = [] if strtobool(os.environ.get('PYARROW_INSTALL_TESTS', '1')): packages = ['pyarrow', 'pyarrow.tests'] else: packages = ['pyarrow'] setup( name='pyarrow', packages=packages, zip_safe=False, package_data={'pyarrow': ['*.pxd', '*.pyx', 'includes/*.pxd']}, include_package_data=True, distclass=BinaryDistribution, # Dummy extension to trigger build_ext ext_modules=[Extension('__dummy__', sources=[])], cmdclass={ 'build_ext': build_ext }, entry_points={ 'console_scripts': [ 'plasma_store = pyarrow:_plasma_store_entry_point' ] }, use_scm_version={ 'root': os.path.dirname(setup_dir), 'parse': parse_git, 'write_to': os.path.join(scm_version_write_to_prefix, 'pyarrow/_generated_version.py'), 'version_scheme': guess_next_dev_version }, setup_requires=['setuptools_scm', 'cython >= 0.29'] + setup_requires, install_requires=install_requires, tests_require=['pytest', 'pandas', 'hypothesis'], python_requires='>=3.6', description='Python library for Apache Arrow', long_description=long_description, long_description_content_type='text/markdown', classifiers=[ 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', ], license='Apache License, Version 2.0', maintainer='Apache Arrow Developers', maintainer_email='dev@arrow.apache.org', test_suite='pyarrow.tests', url='https://arrow.apache.org/' )