Source code for utils.find_duplicates

#!/usr/bin/python3
"""
    This script finds duplicates, e.g. a specific template that is identical to the shared one
"""
import sys
import os
import re
import glob
import argparse


def recursive_globi(mask):
    """
    Simple replacement of glob.glob(mask, recursive=True)
    Reason: Older Python versions support

    The mask has to contain exactly one "**/" part: everything before it is
    the directory that is walked, everything after it is the pattern that the
    end of each visited path has to match. Inside the pattern "*" matches any
    run of characters within one path component; every other character is
    taken literally.

    :param mask: path mask in form "<search_root>**/<pattern>"
    :return: generator of matching paths (files and directories); inside each
             directory sorted files are reported first, then sorted directories
    :raises NotImplementedError: when mask does not contain exactly one "**/"
                                 (raised once the generator is iterated)
    """
    parts = mask.split("**/")
    if len(parts) != 2:
        raise NotImplementedError(
            "mask has to contain exactly one '**/' part: {!r}".format(mask))
    search_root, pattern = parts

    sep = re.escape(os.sep)
    # Escape literal pieces so that e.g. '.' is not treated as regex wildcard
    # and let '*' match inside a single path component only (like glob does)
    path_mask = ("[^" + sep + "]*").join(
        re.escape(piece) for piece in pattern.split("*"))
    # Pattern has to start on a path component boundary, otherwise
    # "fixes/bash" would also match ".../prefixes/bash"
    re_path_mask = re.compile("(?:^|" + sep + ")" + path_mask + "$")

    for root, dirnames, filenames in os.walk(search_root):
        # in-place sort also makes os.walk descend in deterministic order
        dirnames.sort()
        filenames.sort()
        for name in filenames + dirnames:
            full_path = os.path.join(root, name)
            if re_path_mask.search(full_path):
                yield full_path
class DuplicatesFinder(object):
    """
    Compare every file of the shared directory with the file of the same name
    in each of the specific directories and report pairs whose normalized
    content is equal.

    Subclasses adjust the comparison by overriding _normalize_content.
    """

    def __init__(self, root_dir, specific_dirs_mask, shared_dir, shared_files_mask):
        """
        :param root_dir: directory that both following paths are joined to
        :param specific_dirs_mask: mask (handed to recursive_globi) of the
                                   directories searched for duplicates
        :param shared_dir: directory with shared files, relative to root_dir
        :param shared_files_mask: glob mask of the files inside shared_dir
        """
        self._root_dir = root_dir
        self._shared_files_mask = shared_files_mask
        self._shared_dir = os.path.join(root_dir, shared_dir)
        self._specific_dirs_mask = os.path.join(root_dir, specific_dirs_mask)
        self._clear_normalized()

    def _clear_normalized(self):
        """Drop all cached normalized contents"""
        self._normalized = {}

    def _get_normalized(self, file_path):
        """
        Return cached normalized content of file

        The file is read and normalized only on the first request.

        :param file_path: path of the file to read
        :return: content of the file after _normalize_content
        """
        cache = self._normalized
        if file_path not in cache:
            with open(file_path, 'r') as handle:
                raw_content = handle.read()
                cache[file_path] = self._normalize_content(raw_content)
        return cache[file_path]

    def _compare_files(self, shared_filename, specific_filename):
        """
        :return: True if specific_filename is an existing file whose
                 normalized content equals the one of shared_filename
        """
        if os.path.isfile(specific_filename):
            shared_content = self._get_normalized(shared_filename)
            specific_content = self._get_normalized(specific_filename)
            return shared_content == specific_content
        return False

    def _print_match(self, first_filename, second_filename):
        """Report one pair of duplicate files on standard output"""
        message = "Duplicate found! {}\t=>\t{}".format(first_filename, second_filename)
        print(message)

    def search(self):
        """
        Print all duplicates of shared files found in specific directories

        :return: True if any duplicate found
        """
        self._clear_normalized()
        candidate_dirs = list(self._specific_dirs())
        shared_pattern = os.path.join(self._shared_dir, self._shared_files_mask)

        any_duplicate = False
        # Every shared file is checked against every specific directory
        for shared_path in sorted(glob.glob(shared_pattern)):
            file_name = os.path.basename(shared_path)
            for candidate_dir in candidate_dirs:
                candidate_path = os.path.join(candidate_dir, file_name)
                if not self._compare_files(shared_path, candidate_path):
                    continue
                any_duplicate = True
                self._print_match(shared_path, candidate_path)
        return any_duplicate

    def _specific_dirs(self):
        """
        Yield paths matching the specific dirs mask, leaving out those whose
        path starts with the shared directory
        """
        for candidate in recursive_globi(self._specific_dirs_mask):
            if candidate.startswith(self._shared_dir):
                continue
            yield candidate

    def _normalize_content(self, content):
        """Base implementation compares raw content - nothing is changed"""
        return content
class BashDuplicatesFinder(DuplicatesFinder):
    """
    Duplicates finder for bash scripts - comment lines and empty lines are
    ignored when two files are compared
    """

    def __init__(self, root_dir, specific_dirs_mask, shared_dir, shared_files_mask="*.sh"):
        """
        Same parameters as the base class, shared_files_mask defaults to "*.sh"
        """
        DuplicatesFinder.__init__(self, root_dir, specific_dirs_mask, shared_dir,
                                  shared_files_mask)

    def _normalize_content(self, content):
        """
        Strip lines starting with '#' and empty lines

        :param content: text of the bash script
        :return: text without comment lines and without empty lines
        """
        # remove comments
        # naive implementation (todo)
        # re.MULTILINE is required - without it '^' matches only at the very
        # beginning of content and comments on later lines are kept
        content = re.sub(r"^\s*#.*", "", content, flags=re.MULTILINE)

        # remove empty lines
        content = "\n".join([s for s in content.split("\n") if s])

        return content
class OvalDuplicatesFinder(DuplicatesFinder):
    """
    Duplicates finder for oval (xml) files - xml comments, lines starting
    with '#' and empty lines are ignored when two files are compared
    """

    def __init__(self, root_dir, specific_dirs_mask, shared_dir, shared_files_mask="*.xml"):
        """
        Same parameters as the base class, shared_files_mask defaults to "*.xml"
        """
        DuplicatesFinder.__init__(self, root_dir, specific_dirs_mask, shared_dir,
                                  shared_files_mask)

    def _normalize_content(self, content):
        """
        Strip xml comments, lines starting with '#' and empty lines

        :param content: text of the oval file
        :return: text without comments and without empty lines
        """
        # remove comments
        # naive implementation (todo)
        # xml comments go first, so that line based removal below can not
        # cut off the closing '-->' of a multi-line xml comment
        content = re.sub('<!--.*?-->', "", content, flags=re.DOTALL)  # xml comments
        # re.MULTILINE is required - without it '^' matches only at the very
        # beginning of content and '#' lines further down are kept
        content = re.sub(r"^\s*#.*", "", content,
                         flags=re.MULTILINE)  # bash style comments - due to #platform

        # remove empty lines
        content = "\n".join([s for s in content.split("\n") if s])

        return content
def parse_args():
    """
    Parse command line arguments

    :return: argparse namespace with attribute root_ssg_directory
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "root_ssg_directory",
        help="Path to root of ssg git repository",
    )
    parsed = arg_parser.parse_args()
    return parsed
def main():
    """
    main function

    Runs all four scans (static bash, bash templates, static oval, oval
    templates) and exits with 1 if any of them found a duplicate, 0 otherwise
    """
    root_dir = parse_args().root_ssg_directory

    # (printed header, finder class, specific dirs mask, shared dir,
    #  optional shared files mask overriding the default of the finder)
    scans = (
        ("Static bash files:", BashDuplicatesFinder,
         os.path.join("**", "fixes", "bash"),
         os.path.join("shared", "fixes", "bash"),
         ()),
        ("Bash templates:", BashDuplicatesFinder,
         os.path.join("**", "templates"),
         os.path.join("shared", "templates"),
         ("template_BASH_*",)),
        ("Static oval files:", OvalDuplicatesFinder,
         os.path.join("**", "checks", "oval"),
         os.path.join("shared", "checks", "oval"),
         ()),
        ("Templates oval files:", OvalDuplicatesFinder,
         os.path.join("**", "templates"),
         os.path.join("shared", "templates"),
         ("template_OVAL_*",)),
    )

    duplicates_found = False
    # Every scan is executed even when an earlier one already found duplicates
    for header, finder_class, specific_mask, shared_dir, files_mask in scans:
        print(header)
        finder = finder_class(root_dir, specific_mask, shared_dir, *files_mask)
        if finder.search():
            duplicates_found = True

    # Scan results
    if duplicates_found:
        print("Duplicates found!")
        sys.exit(1)
    print("No duplicates found")
    sys.exit(0)
# Run the scan only when the file is executed as a script, not when imported
if __name__ == "__main__": main()