#!/bin/sh
# D. Gibbon, 2004-04-29
# Unix shell script for creating ranked word frequency list
#=====================================================================
# Define file name variables
# The text to analyse may be given as the first command-line argument;
# without an argument the original corpus file is used.
INFILE=${1:-lexicography2004-04-27.txt}
# WORDS:
# OUT_0: one lower-cased word per line, in text order
# OUT_1: "count word" lines, sorted alphabetically
# OUT_2: "count word" lines, sorted by descending count
# OUT_3: final ranked report with header and percentages
OUT_0=lex_out_wordlist.txt
OUT_1=lex_out_alphasorted.txt
OUT_2=lex_out_numericsorted.txt
OUT_3=lex_out_ranked.txt
# DIGRAMS:
# OUT_4: one tab-separated pair of adjacent words per line, in text order
# OUT_5: "count word1 word2" lines, sorted alphabetically
# OUT_6: "count word1 word2" lines, sorted by descending count
# OUT_7: final ranked report with header and percentages
OUT_4=digram_out_list.txt
OUT_5=digram_out_alphasorted.txt
OUT_6=digram_out_numericsorted.txt
OUT_7=digram_out_ranked.txt
#=====================================================================
# Remove punctuation marks and store
# One pipeline: tokenise -> OUT_0, count -> OUT_1, rank -> OUT_2, totals -> tmp.0
#
# sed deletes the characters  - , . : < > ) ( / " \ ' `
# '|' is the s-command delimiter so that the '/' inside the bracket
# expression can never be taken for the end of the pattern.
sed "s|[-,.:<>)(/\"\'\`]||g" < "$INFILE" |
# Blanks, tabs and square brackets all end up as line breaks, which leaves
# one token per line (set2 is spelled out in full; padding of a short set2
# is not portable).
tr ' \011[]' '\012\012\012\012' |
# Fold to lower case. The brackets in both sets are literal for BSD/GNU tr
# and simply map to themselves, which is harmless here.
tr "[A-Z]" "[a-z]" |
# Drop the empty lines produced by runs of separators
grep -v '^$' |
tee "$OUT_0" |
#=====================================================================
# Alphabetic sort with duplicate count/removal and store
sort |
uniq -c |
tee "$OUT_1" |
#=====================================================================
# Numeric sort and store
sort -rn |
tee "$OUT_2" |
#=====================================================================
# Collect number of items and sum of occurrences
# Writes "<number of distinct words> <total occurrences>" to tmp.0;
# "+ 0" forces a numeric 0 when the input is empty.
gawk '
{
  sum += $1;
}
END {
  print NR, sum + 0;
}
' > tmp.0
#=====================================================================
# Print file header
# The group is redirected once, which creates (or truncates) the report
# file; quoting keeps the file name and the date text exactly as they are
# instead of exposing them to word splitting and pathname expansion.
{
  printf '%s\n' "-------------------------------------------------------------"
  printf '%s\n' "Word frequency list"
  printf 'File: %s\n' "$INFILE"
  printf 'Date: %s\n' "$(date)"
  printf '%s\n' "-------------------------------------------------------------"
} > "$OUT_3"
#=====================================================================
# Print frequency information (rank, word, occurrences, percent frequency)
# tmp.0 holds a single line: "<distinct words> <total occurrences>".
lex_types=
lex_tokens=
read -r lex_types lex_tokens < tmp.0
# The totals are handed to gawk with -v, so the only file operand is the
# ranked list and no ARGV entries have to be blanked out.
gawk -v count="$lex_types" -v sum="$lex_tokens" '
BEGIN {
  print "Total items:", count;
  print "Total occurrences:", sum;
  # An empty word list gives sum == 0: report n/a instead of aborting
  # with a division by zero.
  if (sum > 0)
    print "Type:token ratio:", count/sum;
  else
    print "Type:token ratio:", "n/a";
  print "-------------------------------------------------------------";
}
{
  # Input lines are "count word"; NR is the rank because the list is
  # already sorted by descending count.
  pct = (sum > 0) ? 100*$1/sum : 0;
  print NR ": <" $2 ">", $1, "(" pct "%)";
}
' "$OUT_2" >> "$OUT_3"
#=====================================================================
#=====================================================================
# Digram frequency list
# A digram is a pair of adjacent words. The word list is pasted next to a
# copy of itself shifted up by one line.
#
# tmp.1: word list without its first line (the successor of each word).
# "tail -n +2" is the POSIX spelling; the historical "tail +2" is taken for
# a file named "+2" by some tail implementations.
tail -n +2 "$OUT_0" > tmp.1
# tmp.2: number of lines in tmp.1
gawk 'END { print NR }' tmp.1 > tmp.2
# tmp.3: word list without its last line, so both columns have equal length
head -n "$(cat tmp.2)" "$OUT_0" > tmp.3
paste tmp.3 tmp.1 |
tee "$OUT_4" |
#=====================================================================
# Alphabetic sort with duplicate count/removal and store
sort |
uniq -c |
tee "$OUT_5" |
#=====================================================================
# Numeric sort and store
sort -rn |
tee "$OUT_6" |
#=====================================================================
# Collect number of items and sum of occurrences
# Writes "<number of distinct digrams> <total occurrences>" to tmp.4;
# "+ 0" forces a numeric 0 when the input is empty.
gawk '
{
  sum += $1;
}
END {
  print NR, sum + 0;
}
' > tmp.4
#=====================================================================
# Print file header
# The group is redirected once, which creates (or truncates) the report
# file; quoting keeps the file name and the date text exactly as they are
# instead of exposing them to word splitting and pathname expansion.
{
  printf '%s\n' "-------------------------------------------------------------"
  printf '%s\n' "Digram frequency list"
  printf 'File: %s\n' "$INFILE"
  printf 'Date: %s\n' "$(date)"
  printf '%s\n' "-------------------------------------------------------------"
} > "$OUT_7"
#=====================================================================
# Print frequency information (rank, word, occurrences, percent frequency)
# tmp.4 holds a single line: "<distinct digrams> <total occurrences>".
digram_types=
digram_tokens=
read -r digram_types digram_tokens < tmp.4
# The totals are handed to gawk with -v, so the only file operand is the
# ranked list and no ARGV entries have to be blanked out.
gawk -v count="$digram_types" -v sum="$digram_tokens" '
BEGIN {
  print "Total items:", count;
  print "Total occurrences:", sum;
  # With fewer than two words there are no digrams and sum == 0: report
  # n/a instead of aborting with a division by zero.
  if (sum > 0)
    print "Type:token ratio:", count/sum;
  else
    print "Type:token ratio:", "n/a";
  print "-------------------------------------------------------------";
}
{
  # Input lines are "count word1 word2"; NR is the rank because the list
  # is already sorted by descending count.
  pct = (sum > 0) ? 100*$1/sum : 0;
  print NR ": <" $2 "," $3 ">", $1, "(" pct "%)";
}
' "$OUT_6" >> "$OUT_7"
#=====================================================================
# EOF