Friday, October 12, 2012

A script to split a file tree into separate trees - one per file extension present in the original tree

Purpose

Have you ever had a tree of files from which you needed only certain types of file? For example, I had an iTunes library in which some Apple files from another iTunes account were mixed in with a large number of MP3s, and I wanted to pull out a tree containing only the MP3s. You can build such a tree by passing rsync a combination of include and exclude flags that makes it copy only the files matching a pattern while keeping the original directory layout.

How?

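Pass the following flags to rsync to make it copy only the files that match a globbing pattern. rsync applies the rules in order and the first match wins: --include '*/' lets it descend into every directory, --include "*.${extension}" keeps the files with the wanted extension, and --exclude '*' drops everything else. If you want to use this line on its own, replace the variables with your own values.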

In particular, this rsync line:

rsync -av --include '*/' --include "*.${extension}" --exclude '*' ${source_directory}/ ${top_directory_of_results}/${extension}/

The script:

==========================================================

This tool reads a directory of files that have extensions and then copies each type of file to its own tree.

The location of each file in the subtree matches that file's location in the original tree.

Usage:

 ./split_by_file_extension.sh \
 {-s source directory | --source-dir=source directory} \
 {-t top directory of results | --top-directory-of-results=top directory of results} \
 {-e comma,separated,list,of,extensions | --extensions=comma,separated,list,of,extensions}


==========================================================

#!/bin/bash

# Strict mode: stop at the first failing command (-e) and treat any use of
# an unset variable as an error (-u).
set -eu

# File name in the current directory made unique by this shell's process ID.
# NOTE(review): nothing in the visible script reads this variable - confirm
# whether it is still needed.
find_of_files=./find.of.files.$$

# Print the help banner on stdout: a short description of the tool followed
# by the three options it accepts. $0 is shown as the command name.
usage () {

 # One printf call emits every banner line; each argument becomes one line.
 printf '%s\n' \
  "==========================================================" \
  "This tool reads a directory of files that have extensions" \
  "and then copies each type of file to its own tree." \
  "" \
  "The location of each file in the subtree matches that" \
  "file's location in the original tree." \
  "" \
  "Usage: $0 {-s source directory|--source-dir=source directory} \ " \
  "          {-t top directory of results|--top-directory-of-results=top directory of results} \ " \
  "          {-e comma,separated,list,of,extensions | --extensions=comma,separated,list,of,extensions} " \
  "=========================================================="
}

#######################################
# Report whether two directory arguments name the same directory.
# Arguments: $1 - first directory path
#            $2 - second directory path
# Outputs:   "true" or "false" on stdout
# Returns:   0 when both paths could be resolved; non-zero when either one
#            cannot be entered (cd's own error message goes to stderr)
#######################################
are_these_the_same_path () {

 local first_directory
 local second_directory

 # Each path is resolved inside a command-substitution subshell, so the
 # caller's working directory is never changed.
 # "pwd -P" prints the physical path, so a symlink and the directory it
 # points to compare as equal.
 # cd's stdout is discarded because cd prints the directory it entered when
 # it resolves the argument through CDPATH, which would corrupt the result.
 first_directory="$(cd -- "$1" > /dev/null && pwd -P)" || return 1
 second_directory="$(cd -- "$2" > /dev/null && pwd -P)" || return 1

 if [ "${first_directory}" = "${second_directory}" ]
 then
  echo true
 else
  echo false
 fi

}

# Called with no arguments at all: show the help text and exit with failure.
case $# in
 0)
  usage
  exit 1
 ;;
esac

#######################################
#
# Command line parsing.
#
# Every option takes a value, given either as the following argument
# (-s DIR) or attached to the long form (--source-dir=DIR).
# Sets source_directory, extensions and top_directory_of_results, and exits
# with the usage text unless all three were supplied.
#
#######################################

# Exit with the usage text when a short option is the last argument and so
# has no value to consume.
# Arguments: $1 - the option as typed
#            $2 - number of arguments not yet parsed, the option included
require_option_value () {
 if [ "$2" -lt 2 ]
 then
  echo ""
  echo "Option $1 requires a value." 1>&2
  echo ""
  usage
  exit 1
 fi
}

# Number of distinct required options seen. Each option is counted once, so
# repeating one option cannot stand in for a missing one.
needed_number_of_arguments_set=0
source_directory_is_set=false
extensions_is_set=false
top_directory_of_results_is_set=false

while [ $# -gt 0 ]
do
 case "$1" in
  -s|--source-dir=*)
   if [ "$1" = "-s" ]
   then
    require_option_value "$1" $#
    source_directory="$2"
    shift 2
   else
    # Strip the option name; the value keeps its spaces and glob characters.
    source_directory="${1#--source-dir=}"
    shift
   fi
   echo "Source Directory: ${source_directory}"
   # Quoted so that a name with spaces is tested as one path and an empty
   # value is rejected instead of collapsing the test to "[ ! -d ]".
   if [ ! -d "${source_directory}" ]
   then
    echo ""
    echo "source_directory is not a directory." 1>&2
    echo ""
    usage
    exit 1
   fi
   if [ "${source_directory_is_set}" = false ]
   then
    source_directory_is_set=true
    needed_number_of_arguments_set=$((needed_number_of_arguments_set + 1))
   fi
  ;;
  -e|--extensions=*)
   if [ "$1" = "-e" ]
   then
    require_option_value "$1" $#
    extensions="$2"
    shift 2
   else
    extensions="${1#--extensions=}"
    shift
   fi
   echo "Extensions: ${extensions}"
   if [ "${extensions_is_set}" = false ]
   then
    extensions_is_set=true
    needed_number_of_arguments_set=$((needed_number_of_arguments_set + 1))
   fi
  ;;
  -t|--top-directory-of-results=*)
   if [ "$1" = "-t" ]
   then
    require_option_value "$1" $#
    top_directory_of_results="$2"
    shift 2
   else
    top_directory_of_results="${1#--top-directory-of-results=}"
    shift
   fi
   echo "Target Directory: ${top_directory_of_results}"
   if [ ! -d "${top_directory_of_results}" ]
   then
    echo ""
    echo "top_directory_of_results is not a directory." 1>&2
    echo ""
    usage
    exit 1
   fi
   if [ "${top_directory_of_results_is_set}" = false ]
   then
    top_directory_of_results_is_set=true
    needed_number_of_arguments_set=$((needed_number_of_arguments_set + 1))
   fi
  ;;
  -h|--help)
   usage
   exit 0
  ;;
  *)
   echo ""
   echo "Unrecognized flag." 1>&2
   usage
   exit 1
  ;;
 esac
done

# All three of source directory, extensions and results directory are required.
if [ "${needed_number_of_arguments_set}" -ne 3 ]
then
 echo ""
 echo "All of the options must be set." 1>&2
 usage
 exit 1
fi

# Refuse to run when both options name one directory: the per-extension
# trees are created under top_directory_of_results, so rsync would be
# copying the source tree into subdirectories of itself.
# Both arguments are quoted so a directory name containing spaces reaches
# the helper as a single argument.
are_source_directory_and_top_directory_of_results_the_same="$(are_these_the_same_path "${source_directory}" "${top_directory_of_results}")"

if [ "${are_source_directory_and_top_directory_of_results_the_same}" = true ]
then
 echo ""
 echo "source_directory and top_directory_of_results cannot be the same." 1>&2
 echo ""
 usage
 exit 1
fi

#######################################
#
# Main Process.
#
# Split the comma-separated extension list into individual extensions.
# Make one directory per extension under the results directory.
# For each extension, rsync the source tree into that directory, keeping
# only directories and files named *.<extension>, so each copied file sits
# at the same relative path it had in the source tree.
#
#######################################

# Split on commas (and on blanks that may follow a comma) into an array.
# Reading into an array keeps the shell from glob-expanding an entry such
# as "*", which the unquoted word-splitting form would do.
IFS=$', \t' read -r -a extension_list <<< "${extensions}"

# Only loop when there is at least one entry: some bash versions treat the
# expansion of an empty array as an unset variable under "set -u".
if [ "${#extension_list[@]}" -gt 0 ]
then
  # First pass: make sure every per-extension directory exists.
  for extension in "${extension_list[@]}"
  do
    # Doubled commas leave empty entries; ignore them.
    [ -n "${extension}" ] || continue
    mkdir -p -- "${top_directory_of_results}/${extension}"
  done

  # Second pass: copy. rsync uses the first matching rule, so directories
  # are always entered, matching files are kept, and everything else is
  # excluded. Paths are quoted so names containing spaces stay intact.
  for extension in "${extension_list[@]}"
  do
    [ -n "${extension}" ] || continue
    rsync -av --include '*/' --include "*.${extension}" --exclude '*' \
      -- "${source_directory}/" "${top_directory_of_results}/${extension}/"
  done
fi