Skip to content

Commit 3fd287d

Browse files
committed
Pair header and impl files in different dirs
This is the initial implementation of an algorithm to pair header and implementation files located in different directories. The expected (average) pairing complexity changes from O(N) to O(N*logN); the worst case is O(N^2). Moreover, the algorithm can produce false-positive pairings of unrelated header and implementation files within a single project. The approach is based on the heuristic that header and implementation file locations have similar directory structures, which generalizes the case where these files are in the same directory. The basic idea is to sort and group candidate files by common directory structure. Note that these candidate files have the same filename without extensions. The heuristic pairs the files with the largest number of common ancestor directories, counted starting from the files (not from the root directory!). The implementation has gone through only minimal (manual) testing and should be tested on real projects. Closes #19.
1 parent 1bfe67e commit 3fd287d

File tree

1 file changed

+65
-17
lines changed

1 file changed

+65
-17
lines changed

cppdep.py

Lines changed: 65 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from __future__ import print_function, division, absolute_import
2626

2727
import argparse as ap
28+
import collections
2829
import fnmatch
2930
import glob
3031
import itertools
@@ -429,16 +430,27 @@ def construct_components(self):
429430
430431
Unpaired c files are counted as incomplete components with warnings.
431432
"""
432-
hpaths = []
433-
cpaths = []
433+
file_type = collections.namedtuple('File', ['rev_path', 'path'])
434+
hpaths = collections.defaultdict(list)
435+
cpaths = collections.defaultdict(list)
436+
437+
# This approach is pessimistic with O(N*logN) instead of O(N)
438+
# because it assumes the header and implementation files
439+
# are likely to be in different directories.
440+
def _reverse(path):
441+
path = strip_ext(path).split(os.path.sep)
442+
path.reverse()
443+
return path
434444

435445
def _select_src_file(root, filename):
436446
full_path = os.path.join(root, filename)
437447
if any(fnmatch.fnmatch(full_path, x) for x in self.ignore_paths):
438448
return
439449
src_match = Package._RE_SRC.match(filename)
440450
if src_match:
441-
(hpaths if src_match.group('h') else cpaths).append(full_path)
451+
(hpaths if src_match.group('h')
452+
else cpaths)[strip_ext(filename)].append(
453+
file_type(_reverse(full_path), full_path))
442454

443455
def _gather_files(dir_path):
444456
for root, _, files in os.walk(dir_path):
@@ -454,20 +466,56 @@ def _gather_files(dir_path):
454466
else:
455467
_select_src_file(*os.path.split(src_path))
456468

457-
# This approach assumes
458-
# that the header and implementation are in the same directory.
459-
# TODO: Implement less-restricted, general pairing.
460-
cbases = dict((strip_ext(x), x) for x in cpaths)
461-
for hpath in hpaths:
462-
cpath = None
463-
key = strip_ext(hpath)
464-
if key in cbases:
465-
cpath = cbases[key]
466-
del cbases[key]
467-
self.components.append(Component(hpath, cpath, self))
468-
469-
for cpath in cbases.values():
470-
self.components.append(Component(None, cpath, self))
469+
self.__pair_files(hpaths, cpaths)
470+
471+
def __pair_files(self, hpaths, cpaths):
    """Builds components by matching headers with implementation files.

    Both arguments map an extension-less file name to the list of
    candidate files carrying that name.  Every candidate exposes
    ``rev_path`` (the sequence compared element by element below)
    and ``path`` (the value handed over to ``Component``).

    Candidates sharing a name are matched greedily by the number of
    leading ``rev_path`` elements they have in common, that is,
    consecutive ancestors counted from the file itself (not the root!).
    Headers or implementation files left without a partner
    are appended as one-sided components.

    Note: paired entries are removed from ``cpaths``
    and from the header lists of ``hpaths`` along the way.
    """
    # A graph algorithm would probably be a better fit for this problem:
    # the nodes are file and directory names, and the goal is to find
    # the nodes with the longest run of matching consecutive ancestors.
    #
    # The match is ambiguous whenever several candidates share
    # the same number of common ancestors,
    # so a plain lowest-common-ancestor search seems to give false answers.
    def _shared_depth(first, second):
        # Length of the common prefix of the two reversed paths.
        depth = 0
        for left, right in zip(first.rev_path, second.rev_path):
            if not left == right:
                break
            depth += 1
        return depth

    def _match(headers, sources):
        assert headers and sources  # Expected to have few elements.
        ranked = []
        for source in sources:
            scored = [(_shared_depth(source, header), header)
                      for header in headers]
            scored.sort(reverse=True)  # The closest headers come first.
            ranked.append((source, scored))
        # The sources with the strongest matches pick their headers first.
        ranked = sorted(ranked, reverse=True,
                        key=lambda entry: tuple(score
                                                for score, _ in entry[1]))
        for source, scored in ranked:
            chosen = None
            for _, header in scored:
                if header in headers:  # Not yet claimed by another source.
                    chosen = header
                    break
            if chosen is None:
                yield None, source.path
            else:
                yield chosen.path, source.path
                headers.remove(chosen)

        # Whatever remains has no implementation file to go with.
        for header in headers:
            yield header.path, None

    for stem, headers in hpaths.items():
        if stem in cpaths:
            sources = cpaths.pop(stem)
            self.components.extend(
                Component(hpath, cpath, self)
                for hpath, cpath in _match(headers, sources))
        else:
            self.components.extend(
                Component(header.path, None, self) for header in headers)

    # Only the implementation files without any same-named header are left.
    for sources in cpaths.values():
        self.components.extend(
            Component(None, source.path, self) for source in sources)
471519

472520
def dependencies(self):
473521
"""Returns dependency packages."""

0 commit comments

Comments
 (0)