functions.sh/com/index.sh

122 lines
3.6 KiB
Bash
Executable File

#!/bin/bash
#
# author: @jakimfett
# license: cc-by-sa
#
# usage: index.sh <folder>
# Creates a sorted, cleaned index of files in a given directory.
# What to index?
# Base directory: wherever the script was launched from.
workingDir=$(pwd)
# Name pieces used when building index file paths.
cacheDir="cache"
projectName="functions.sh"
# Load com/usr/aliases.src from beneath the working directory.
# NOTE(review): this only resolves when the script is started from the
# directory that contains com/ - confirm that is the intended entry point.
aliases="$workingDir/com/usr/aliases.src"
. "$aliases"
# Okay, so mlocate is a good thought, but the performance is poor and it's not always available.
#
# functions.sh is about minimalism, and removing a dependency would be nice,
# especially at this point.
# we have to assume there's an index.
# and we are going to assume that there's occasionally more than one.
# so reading a standardized multi-piece index is the first hurdle.
#
# let's start small
# my intention:
# read in all the indexes, output to a single file, after sorting and de-duplicating.
# this was my first pseudo-code for it.
# cat folder.dex.* >> "${cache}/index/${projectName}.${chronoStamp}.dex"; exit
#
# That didn't work, we need to locate where stuff is first.
#
# the -name flag for find seems relevant.
# Debug preview: print the path the merged index would be written to.
# The cache folder name is defined in this file as `cacheDir`; `cache` is not
# set anywhere in this script, so fall back to "/${cacheDir}" when it is
# unset or empty instead of silently dropping the cache directory from the
# path. A `cache` value provided by the sourced aliases file still wins.
# `chronoStamp` is not defined in this file - presumably it comes from the
# sourced aliases; verify.
echo "${workingDir}${cache:-/${cacheDir}}/index/${projectName}.$(chronoStamp).dex"
echo "finding ${projectName}:"
#find ~ -name "${projectName}" -type d 2>/dev/null
#>> "${cache}/index/${projectName}.${chronoStamp}.dex"
# Work in progress stops here: nothing below this line is executed.
exit
########### Include functions ###########
# Bail out when there is no functions.sh in the current directory.
# A direct existence test is used instead of matching the text of find's
# error message, which depends on the locale and on find's wording, and
# which made find walk the whole tree when functions.sh is a directory.
# NOTE(review): the check looks in the current directory while the file that
# gets sourced is resolved next to this script - confirm both are meant to be
# the same place.
if [[ ! -e functions.sh ]]; then
  echo "functions.sh not found, exiting"
  exit 1
fi
clear
source "$(dirname "$0")/functions.sh"
when.sh
echo
logThis "loaded:'functions.sh'"
########### End include functions #######
if [ ! -z "${1}" ]; then
TARGETDIR="$1"
DUPEDIR="/Users/jakimfett/Downloads/dupes/"
cd $TARGETDIR
TEMPOUTPUT="${TEMPFOLDER}/`basename ${TARGETDIR}`.index.tmp"
touch ${TEMPOUTPUT}.md5sum
echo "Temp index will be created at '${TEMPOUTPUT}'"
echo "Directory size:"
du -hs .
echo
echo "starting file indexing:"
find . | sort | uniq > "${TEMPOUTPUT}"
#if [ ! -f "${TEMPOUTPUT}" ] ; then
# echo "Skipping creation of index, remove '${TEMPOUTPUT}' to regen."
#fi
# if [ ! -f "${TEMPOUTPUT}.md5sum" ] ; then
# during loops, fail early when possible
while read path; do
# check if variable is empty
if [ ! -z "${path}" ] ; then
# check if it's a directory
if [ ! -d "${path}" ] ; then
fileName="`basename ${path}`"
# expensive processing, but necessary eventually
# @todo - refactor to multiprocess this?
# @todo use rhash --sha3-512, requires homebrew on OSX?
# @todo switch to using keccak
fileMD5="`md5 -q "${path}"`"
echo -n '.'
# this is a horrible hack but it works (for now)
fileSize="`echo $(wc -c <"${path}") | xargs`"
# @todo - refactor file size calculation method
dupe="`grep "${fileMD5}" "${path}"`"
# file is a duplicate
if [ ! -z "${dupe}" ]; then
echo "new file: ${fileMD5} ${path}"
echo "duplicate file: ${dupe}"
# mv '${path}' '${DUPEDIR}'
fi
echo "${fileMD5} | ${fileSize} | ${path}" >> "${TEMPOUTPUT}.md5sum"
fi
fi
done <"${TEMPOUTPUT}"
#echo "# File list" > ${TEMPOUTPUT}.list
#echo "md5\t|\tfilepath" >> ${TEMPOUTPUT}.list
#threshold=1
#cat ${TEMPOUTPUT}.md5sum | awk '{print $2}' | sort | uniq -c | awk -v threshold="$threshold" '$1 > threshold' > ${TEMPOUTPUT}.counts
echo "file indexing done"
else
logThis "input a folder path please"
fi