File: //proc/self/root/proc/self/root/usr/lib/python3.6/site-packages/sos/cleaner/archives/__init__.py
# Copyright 2020 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
# This file is part of the sos project: https://github.com/sosreport/sos
#
# This copyrighted material is made available to anyone wishing to use,
# modify, copy, or redistribute it subject to the terms and conditions of
# version 2 of the GNU General Public License.
#
# See the LICENSE file in the source distribution for further information.
import logging
import os
import shutil
import stat
import tarfile
import tempfile
import re
from concurrent.futures import ProcessPoolExecutor
from sos.utilities import file_is_binary
# python older than 3.8 will hit a pickling error when we go to spawn a new
# process for extraction if this method is a part of the SoSObfuscationArchive
# class. So, the simplest solution is to remove it from the class.
def extract_archive(archive_path, tmpdir):
with tarfile.open(archive_path) as archive:
path = os.path.join(tmpdir, 'cleaner')
# set extract filter since python 3.12 (see PEP-706 for more)
# Because python 3.10 and 3.11 raises false alarms as exceptions
# (see #3330 for examples), we can't use data filter but must
# fully trust the archive (legacy behaviour)
archive.extraction_filter = getattr(tarfile, 'fully_trusted_filter',
(lambda member, path: member))
# Guard against "Arbitrary file write during tarfile extraction"
# Checks the extracted files don't stray out of the target directory.
for member in archive.getmembers():
member_path = os.path.join(path, member.name)
abs_directory = os.path.abspath(path)
abs_target = os.path.abspath(member_path)
prefix = os.path.commonprefix([abs_directory, abs_target])
if prefix != abs_directory:
raise Exception(f"Attempted path traversal in tarfle"
f"{prefix} != {abs_directory}")
archive.extract(member, path)
return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
class SoSObfuscationArchive():
"""A representation of an extracted archive or an sos archive build
directory which is used by SoSCleaner.
Each archive that needs to be obfuscated is loaded into an instance of this
class. All report-level operations should be contained within this class.
"""
files_obfuscated_count = 0
total_sub_count = 0
removed_file_count = 0
type_name = 'undetermined'
description = 'undetermined'
is_nested = False
prep_files = {}
def __init__(self, archive_path, tmpdir, keep_binary_files):
self.archive_path = archive_path
self.final_archive_path = self.archive_path
self.tmpdir = tmpdir
self.archive_name = self.archive_path.split('/')[-1].split('.tar')[0]
self.ui_name = self.archive_name
self.soslog = logging.getLogger('sos')
self.ui_log = logging.getLogger('sos_ui')
self.skip_list = self._load_skip_list()
self.is_extracted = False
self._load_self()
self.archive_root = ''
self.keep_binary_files = keep_binary_files
self.parsers = ()
self.log_info(
f"Loaded {self.archive_path} as type {self.description}"
)
def obfuscate_string(self, string_data):
for parser in self.parsers:
try:
string_data = parser.parse_string_for_keys(string_data)
except Exception as err:
self.log_info(f"Error obfuscating string data: {err}")
return string_data
# TODO: merge content to obfuscate_arc_files as that is the only place we
# call obfuscate_filename ?
def obfuscate_filename(self, short_name, filename):
_ob_short_name = self.obfuscate_string(short_name.split('/')[-1])
_ob_filename = short_name.replace(short_name.split('/')[-1],
_ob_short_name)
if _ob_filename != short_name:
arc_path = filename.split(short_name)[0]
_ob_path = os.path.join(arc_path, _ob_filename)
# ensure that any plugin subdirs that contain obfuscated strings
# get created with obfuscated counterparts
if not os.path.islink(filename):
os.rename(filename, _ob_path)
else:
# generate the obfuscated name of the link target
_target_ob = self.obfuscate_string(os.readlink(filename))
# remove the unobfuscated original symlink first, in case the
# symlink name hasn't changed but the target has
os.remove(filename)
# create the newly obfuscated symlink, pointing to the
# obfuscated target name, which may not exist just yet, but
# when the actual file is obfuscated, will be created
os.symlink(_target_ob, _ob_path)
def set_parsers(self, parsers):
self.parsers = parsers # TODO: include this in __init__?
def load_parser_entries(self):
self.soslog = logging.getLogger('sos')
self.ui_log = logging.getLogger('sos_ui')
for parser in self.parsers:
parser.load_map_entries()
def obfuscate_line(self, line, parsers=None):
"""Run a line through each of the obfuscation parsers, keeping a
cumulative total of substitutions done on that particular line.
Positional arguments:
:param line str: The raw line as read from the file being
processed
:param parsers: A list of parser objects to obfuscate
with. If None, use all.
Returns the fully obfuscated line and the number of substitutions made
"""
# don't iterate over blank lines, but still write them to the tempfile
# to maintain the same structure when we write a scrubbed file back
count = 0
if not line.strip():
return line, count
if parsers is None:
parsers = self.parsers
for parser in parsers:
try:
line, _count = parser.parse_line(line)
count += _count
except Exception as err:
self.log_debug(f"failed to parse line: {err}", parser.name)
return line, count
def obfuscate_arc_files(self, flist):
self.load_parser_entries()
for filename in flist:
self.log_debug(f" pid={os.getpid()}: obfuscating {filename}")
try:
short_name = filename.split(self.archive_name + '/')[1]
if self.should_skip_file(short_name):
continue
if (not self.keep_binary_files and
self.should_remove_file(short_name)):
# We reach this case if the option --keep-binary-files
# was not used, and the file is in a list to be removed
self.remove_file(short_name)
continue
if (self.keep_binary_files and
(file_is_binary(filename) or
self.should_remove_file(short_name))):
# We reach this case if the option --keep-binary-files
# is used. In this case we want to make sure
# the cleaner doesn't try to clean a binary file
continue
if os.path.islink(filename):
# don't run the obfuscation on the link, but on the actual
# file at some other point.
continue
_parsers = [
_p for _p in self.parsers if not
any(
_skip.match(short_name) for _skip in _p.skip_patterns
)
]
if not _parsers:
self.log_debug(
f"Skipping obfuscation of {short_name or filename} "
f"due to matching file skip pattern"
)
continue
self.log_debug(f"Obfuscating {short_name or filename}")
subs = 0
with tempfile.NamedTemporaryFile(mode='w', dir=self.tmpdir) \
as tfile:
with open(filename, 'r', encoding='utf-8',
errors='replace') as fname:
for line in fname:
try:
line, cnt = self.obfuscate_line(line, _parsers)
subs += cnt
tfile.write(line)
except Exception as err:
self.log_debug(f"Unable to obfuscate "
f"{short_name}: {err}")
tfile.seek(0)
if subs:
shutil.copyfile(tfile.name, filename)
self.update_sub_count(subs)
self.obfuscate_filename(short_name, filename)
except Exception as err:
self.log_debug(f" pid={os.getpid()}: caught exception on "
f"obfuscating file {filename}: {err}")
return (self.files_obfuscated_count, self.total_sub_count,
self.removed_file_count)
@classmethod
def check_is_type(cls, arc_path):
"""Check if the archive is a well-known type we directly support"""
raise NotImplementedError
@property
def is_sos(self):
return 'sos' in self.__class__.__name__.lower()
@property
def is_insights(self):
return 'insights' in self.type_name
def _load_self(self):
if self.is_tarfile:
# pylint: disable=consider-using-with
self.tarobj = tarfile.open(self.archive_path)
def get_nested_archives(self):
"""Return a list of ObfuscationArchives that represent additional
archives found within the target archive. For example, an archive from
`sos collect` will return a list of ``SoSReportArchive`` objects.
This should be overridden by individual types of ObfuscationArchive's
"""
return []
def get_archive_root(self):
"""Set the root path for the archive that should be prepended to any
filenames given to methods in this class.
"""
if self.is_tarfile:
toplevel = self.tarobj.firstmember
if toplevel.isdir():
return toplevel.name
return os.path.dirname(toplevel.name) or os.sep
return os.path.abspath(self.archive_path)
def report_msg(self, msg):
"""Helper to easily format ui messages on a per-report basis"""
self.ui_log.info(f"{self.ui_name + ' :':<50} {msg}")
def _fmt_log_msg(self, msg, caller=None):
return f"[cleaner{f':{caller}' if caller else ''}" \
f"[{self.archive_name}]] {msg}"
def log_debug(self, msg, caller=None):
self.soslog.debug(self._fmt_log_msg(msg, caller))
def log_info(self, msg, caller=None):
self.soslog.info(self._fmt_log_msg(msg, caller))
def log_error(self, msg, caller=None):
self.soslog.error(self._fmt_log_msg(msg, caller))
def _load_skip_list(self):
"""Provide a list of files and file regexes to skip obfuscation on
Returns: list of files and file regexes
"""
return [
'proc/kallsyms',
'sosreport-',
'sys/firmware',
'sys/fs',
'sys/kernel/debug',
'sys/module'
]
@property
def is_tarfile(self):
try:
return tarfile.is_tarfile(self.archive_path)
except Exception:
return False
def remove_file(self, fname):
"""Remove a file from the archive. This is used when cleaner encounters
a binary file, which we cannot reliably obfuscate.
"""
full_fname = self.get_file_path(fname)
# don't call a blank remove() here
if full_fname:
self.log_info(f"Removing binary file '{fname}' from archive")
os.remove(full_fname)
self.removed_file_count += 1
def format_file_name(self, fname):
"""Based on the type of archive we're dealing with, do whatever that
archive requires to a provided **relative** filepath to be able to
access it within the archive
"""
if not self.is_extracted:
if not self.archive_root:
self.archive_root = self.get_archive_root()
return os.path.join(self.archive_root, fname)
return os.path.join(self.extracted_path, fname)
def get_file_content(self, fname):
"""Return the content from the specified fname. Particularly useful for
tarball-type archives so we can retrieve prep file contents prior to
extracting the entire archive
"""
if self.is_extracted is False and self.is_tarfile:
filename = self.format_file_name(fname)
try:
return self.tarobj.extractfile(filename).read().decode('utf-8')
except KeyError:
self.log_debug(
f"Unable to retrieve {fname}: no such file in archive"
)
return ''
else:
try:
with open(self.format_file_name(fname), 'r',
encoding='utf-8') as to_read:
return to_read.read()
except Exception as err:
self.log_debug(f"Failed to get contents of {fname}: {err}")
return ''
def extract(self, quiet=False):
if self.is_tarfile:
if not quiet:
self.report_msg("Extracting...")
self.extracted_path = self.extract_self()
self.is_extracted = True
self.tarobj = None # we can't pickle this & not further needed
else:
self.extracted_path = self.archive_path
# if we're running as non-root (e.g. collector), then we can have a
# situation where a particular path has insufficient permissions for
# us to rewrite the contents and/or add it to the ending tarfile.
# Unfortunately our only choice here is to change the permissions
# that were preserved during report collection
if os.getuid() != 0:
self.log_debug('Verifying permissions of archive contents')
for dirname, dirs, files in os.walk(self.extracted_path):
try:
for _dir in dirs:
_dirname = os.path.join(dirname, _dir)
_dir_perms = os.stat(_dirname).st_mode
os.chmod(_dirname, _dir_perms | stat.S_IRWXU)
for filename in files:
fname = os.path.join(dirname, filename)
# protect against symlink race conditions
if not os.path.exists(fname) or os.path.islink(fname):
continue
if (not os.access(fname, os.R_OK) or not
os.access(fname, os.W_OK)):
self.log_debug(
"Adding owner rw permissions to "
f"{fname.split(self.archive_path)[-1]}"
)
os.chmod(fname, stat.S_IRUSR | stat.S_IWUSR)
except Exception as err:
self.log_debug(f"Error while trying to set perms: {err}")
self.log_debug(f"Extracted path is {self.extracted_path}")
def rename_top_dir(self, new_name):
"""Rename the top-level directory to new_name, which should be an
obfuscated string that scrubs the hostname from the top-level dir
which would be named after the unobfuscated sos report
"""
_path = self.extracted_path.replace(self.archive_name, new_name)
self.archive_name = new_name
os.rename(self.extracted_path, _path)
self.extracted_path = _path
def get_compression(self):
"""Return the compression type used by the archive, if any. This is
then used by SoSCleaner to generate a policy-derived compression
command to repack the archive
"""
if self.is_tarfile:
if self.archive_path.endswith('xz'):
return 'xz'
return 'gz'
return None
def build_tar_file(self, method):
"""Pack the extracted archive as a tarfile to then be re-compressed
"""
mode = 'w'
tarpath = self.extracted_path + '-obfuscated.tar'
compr_args = {}
if method:
mode += f":{method}"
tarpath += f".{method}"
if method == 'xz':
compr_args = {'preset': 3}
else:
compr_args = {'compresslevel': 6}
self.log_debug(f"Building tar file {tarpath}")
with tarfile.open(tarpath, mode=mode, **compr_args) as tar:
tar.add(self.extracted_path,
arcname=os.path.split(self.archive_name)[1])
return tarpath
def compress(self, method):
"""Execute the compression command, and set the appropriate final
archive path for later reference by SoSCleaner on a per-archive basis
"""
try:
self.final_archive_path = self.build_tar_file(method)
except Exception as err:
self.log_debug(f"Exception while re-compressing archive: {err}")
raise
self.log_debug(f"Compressed to {self.final_archive_path}")
try:
self.remove_extracted_path()
except Exception as err:
self.log_debug(f"Failed to remove extraction directory: {err}")
self.report_msg('Failed to remove temporary extraction directory')
def remove_extracted_path(self):
"""After the tarball has been re-compressed, remove the extracted path
so that we don't take up that duplicate space any longer during
execution
"""
try:
self.log_debug(f"Removing {self.extracted_path}")
shutil.rmtree(self.extracted_path)
except OSError:
os.chmod(self.extracted_path, stat.S_IWUSR)
if os.path.isfile(self.extracted_path):
os.remove(self.extracted_path)
else:
shutil.rmtree(self.extracted_path)
def extract_self(self):
"""Extract an archive into our tmpdir so that we may inspect it or
iterate through its contents for obfuscation
"""
with ProcessPoolExecutor(1) as _pool:
_path_future = _pool.submit(extract_archive,
self.archive_path, self.tmpdir)
path = _path_future.result()
return path
def get_symlinks(self):
"""Iterator for a list of symlinks in the archive"""
for dirname, dirs, files in os.walk(self.extracted_path):
for _dir in dirs:
_dirpath = os.path.join(dirname, _dir)
if os.path.islink(_dirpath):
yield _dirpath
for filename in files:
_fname = os.path.join(dirname, filename)
if os.path.islink(_fname):
yield _fname
def get_files(self):
"""Iterator for a list of files in the archive, to allow clean to
iterate over.
Will not include symlinks, as those are handled separately
"""
for dirname, _, files in os.walk(self.extracted_path):
for filename in files:
_fname = os.path.join(dirname, filename.lstrip('/'))
if not os.path.islink(_fname):
yield _fname
def get_directory_list(self):
"""Return a list of all directories within the archive"""
dir_list = []
for dirname, _, _ in os.walk(self.extracted_path):
dir_list.append(dirname)
return dir_list
def update_sub_count(self, count):
"""Called when a file has finished being parsed and used to track
total substitutions made and number of files that had changes made
"""
self.files_obfuscated_count += 1
self.total_sub_count += count
def get_file_path(self, fname):
"""Return the filepath of a specific file within the archive so that
it may be selectively inspected if it exists
"""
_path = os.path.join(self.extracted_path, fname.lstrip('/'))
return _path if os.path.exists(_path) else ''
def should_skip_file(self, filename):
"""Checks the provided filename against a list of filepaths to not
perform obfuscation on, as defined in self.skip_list
Positional arguments:
:param filename str: Filename relative to the extracted
archive root
"""
if (not os.path.isfile(self.get_file_path(filename)) and not
os.path.islink(self.get_file_path(filename))):
return True
for _skip in self.skip_list:
if filename.startswith(_skip) or re.match(_skip, filename):
return True
return False
def should_remove_file(self, fname):
"""Determine if the file should be removed or not, due to an inability
to reliably obfuscate that file based on the filename.
:param fname: Filename relative to the extracted archive root
:type fname: ``str``
:returns: ``True`` if the file cannot be reliably obfuscated
:rtype: ``bool``
"""
obvious_removes = [
r'.*\.gz$', # TODO: support flat gz/xz extraction
r'.*\.xz$',
r'.*\.bzip2$',
r'.*\.tar\..*', # TODO: support archive unpacking
r'.*\.txz$',
r'.*\.tgz$',
r'.*\.bin$',
r'.*\.journal$',
r'.*\~$'
]
# if the filename matches, it is obvious we can remove them without
# doing the read test
for _arc_reg in obvious_removes:
if re.match(_arc_reg, fname):
return True
_full_path = self.get_file_path(fname)
if os.path.isfile(_full_path):
return file_is_binary(_full_path)
# don't fail on dir-level symlinks
return False
# vim: set et ts=4 sw=4 :