#!/bin/sh
# D. Gibbon, 2004-04-29
# Unix shell script for creating ranked word frequency list
#
# Usage: sh <this script> [INFILE]
#
# Reads INFILE (default: the name assigned to INFILE below), strips
# punctuation, lower-cases the text and writes, into the current directory:
#   OUT_0  one word per line, in text order
#   OUT_1  distinct words with counts, alphabetically sorted
#   OUT_2  distinct words with counts, sorted by descending count
#   OUT_3  ranked word report (header, totals, rank/word/count/percent)
#   OUT_4 .. OUT_7  the same four files for digrams (adjacent word pairs)
# No temporary files are created.
#=====================================================================

set -u

#=====================================================================
# Define file name variables
INFILE=lexicography2004-04-27.txt
# WORDS:
OUT_0=lex_out_wordlist.txt
OUT_1=lex_out_alphasorted.txt
OUT_2=lex_out_numericsorted.txt
OUT_3=lex_out_ranked.txt
# DIGRAMS:
OUT_4=digram_out_list.txt
OUT_5=digram_out_alphasorted.txt
OUT_6=digram_out_numericsorted.txt
OUT_7=digram_out_ranked.txt

# Only POSIX awk features are used below; gawk is preferred when it is
# installed, plain awk is used otherwise.
if command -v gawk >/dev/null 2>&1; then
  AWK=gawk
else
  AWK=awk
fi

#=====================================================================
# tokenize FILE
# Remove punctuation marks and print one lower-cased word per line.
# Arguments: $1 - text file to read
# Outputs:   word list on stdout
tokenize() {
  # '#' is the s-command delimiter so that the '/' inside the bracket
  # expression cannot be mistaken for the end of the pattern.
  # The bracket expression deletes: - , . : < > ) ( / " \ ' and backquote.
  sed 's#[-,.:<>)(/"\'\''`]##g' "$1" |
    tr '[' ' ' |
    tr ']' ' ' |
    tr ' \011' '\012\012' |
    tr 'A-Z' 'a-z' |
    grep -v '^$'
}

#=====================================================================
# count_items ALPHA_FILE NUMERIC_FILE
# Alphabetic sort with duplicate count/removal, then numeric sort.
# Arguments: $1 - file receiving the "count item" lines in alphabetic order
#            $2 - file receiving the same lines sorted by descending count
# Input:     one item per line on stdin
count_items() {
  sort | uniq -c | tee "$1" | sort -rn > "$2"
}

#=====================================================================
# digrams WORD_FILE
# Print every pair of adjacent lines of WORD_FILE as "first<TAB>second".
# A file with fewer than two lines produces no output.
digrams() {
  "$AWK" 'NR > 1 { print prev "\t" $0 } { prev = $0 }' "$1"
}

#=====================================================================
# totals COUNTS_FILE
# Collect number of items and sum of occurrences.
# Outputs: "<number of lines> <sum of first column>" on stdout
totals() {
  "$AWK" '{ sum += $1 } END { print NR, sum + 0 }' "$1"
}

#=====================================================================
# write_report TITLE COUNTS_FILE REPORT_FILE KIND
# Print file header and frequency information
# (rank, item, occurrences, percent frequency).
# Arguments: $1 - title line of the report
#            $2 - "count item" file sorted by descending count
#            $3 - report file to (over)write
#            $4 - "word" (item is column 2) or "digram" (columns 2 and 3)
# Globals:   INFILE (read) - named in the report header
write_report() {
  report_totals=$(totals "$2")
  {
    echo "-------------------------------------------------------------"
    printf '%s\n' "$1"
    printf 'File: %s\n' "$INFILE"
    printf 'Date: %s\n' "$(date)"
    echo "-------------------------------------------------------------"
    "$AWK" -v count="${report_totals% *}" \
           -v sum="${report_totals#* }" \
           -v kind="$4" '
      BEGIN {
        print "Total items:", count
        print "Total occurrences:", sum
        # sum is 0 when the input contained no words; avoid dividing by it.
        print "Type:token ratio:", (sum + 0 > 0 ? count / sum : 0)
        print "-------------------------------------------------------------"
      }
      {
        item = (kind == "digram") ? $2 "," $3 : $2
        print NR ": <" item ">", $1, "(" 100 * $1 / sum "%)"
      }
    ' "$2"
  } > "$3"
}

#=====================================================================
# main [INFILE]
# Build the word and digram frequency files for INFILE.
# Returns: non-zero, without touching any output file, when INFILE cannot
#          be read
main() {
  if [ "$#" -gt 0 ]; then
    INFILE=$1
  fi
  if [ ! -r "$INFILE" ]; then
    printf '%s: cannot read input file: %s\n' "${0##*/}" "$INFILE" >&2
    return 1
  fi

  # Word frequency list
  tokenize "$INFILE" > "$OUT_0"
  count_items "$OUT_1" "$OUT_2" < "$OUT_0"
  write_report "Word frequency list" "$OUT_2" "$OUT_3" word

  # Digram frequency list
  digrams "$OUT_0" | tee "$OUT_4" | count_items "$OUT_5" "$OUT_6"
  write_report "Digram frequency list" "$OUT_6" "$OUT_7" digram
}

main "$@"
#=====================================================================
# EOF