#!/usr/bin/env bash
# This script uses indexed arrays (pos_client[...]), which plain POSIX sh
# does not provide, so it has to be run by bash rather than /bin/sh.

# Base URL of the pages to fetch, without the page number; the page number
# is appended to it.
my_url="http://auctions.overstock.com/cgi-bin/auctions.cgi?PAGE=PRODDET&PRODUCTID="
# Scratch file; only referenced by the currently disabled wget/fetch code.
temp_file=/var/tmp/temp_file.log
# start for random page number
start_num=1000000
# end for random page number
end_num=9999999
# useragent of the emulated browser
useragent='Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'

# Array with the candidate clients. Indexes 1 (wget) and 3 (fetch) are
# intentionally left unset while those clients are disabled.
#pos_client[1]="wget"
pos_client[2]="lynx"
#pos_client[3]="fetch"
pos_client[4]="links"
# bulk data to cut from the output
bulk_data="info@auctions.overstock.com"
# number of pages to get
pages_to_get=100
# File the collected lines are appended to.
log_file=/var/tmp/emails.txt

# banner
# Prints the two-line program banner (title line and an underline) to stdout.
banner () {
  printf '%s\n' \
    'E-mail grasp V. 0.5 - Written by hip0' \
    '-------------------------------------'
}




# choose web client to use as a page fetcher
#
# Looks up every supported client and records the result in globals:
#   have_lynx,  have_lynx_opts   - resolved lynx command and its option string
#   have_links, have_links_opts  - resolved links command and its option string
#   use_client                   - first client found, checked in the order
#                                  wget, lynx, fetch, links
# Reads:   pos_client[], useragent
# Returns: 0 when a client was selected, 1 (with a message on stderr) when
#          none of the supported clients could be found.
choose_client () {
  local candidate

  # Start from a clean slate so that a repeated call cannot keep reporting
  # a client that is no longer available.
  have_lynx=''
  have_lynx_opts=''
  have_links=''
  have_links_opts=''
  use_client=''

  # wget support is disabled:
  #have_wget="$(command -v "${pos_client[1]}" 2>/dev/null)"
  #[ -n "${have_wget}" ] && have_wget_opts="-q -O ${temp_file} "

  # `command -v` is a shell builtin: unlike `which` it needs no external
  # program and prints nothing when the command does not exist.
  have_lynx="$(command -v "${pos_client[2]}" 2>/dev/null)"
  if [ -n "${have_lynx}" ]; then
    # NOTE: the single quotes below are literal characters, not shell
    # quoting. If this string is expanded unquoted the user agent is split
    # into several words; code that needs it as one argument should pass
    # -useragent="${useragent}" itself.
    have_lynx_opts="-useragent='$useragent' -dump"
  fi

  # fetch support is disabled:
  #have_fetch="$(command -v "${pos_client[3]}" 2>/dev/null)"
  #[ -n "${have_fetch}" ] && have_fetch_opts="-o ${temp_file} "

  have_links="$(command -v "${pos_client[4]}" 2>/dev/null)"
  if [ -n "${have_links}" ]; then
    have_links_opts="-dump"
  fi

  # Pick the first client that was found. have_wget and have_fetch stay in
  # the list so the priority order is kept if they are enabled again; while
  # they are disabled they are empty and skipped.
  for candidate in "${have_wget}" "${have_lynx}" "${have_fetch}" "${have_links}"; do
    if [ -n "${candidate}" ]; then
      use_client="${candidate}"
      break
    fi
  done

  if [ -z "${use_client}" ]; then
    printf 'choose_client: none of the supported clients (%s) was found\n' \
      "${pos_client[*]}" >&2
    return 1
  fi
  return 0
}

# Function to grasp needed page
#
# Fetches ${pages_to_get} pages whose number is drawn at random from
# [start_num, end_num] and appends every output line that contains "@",
# except lines containing ${bulk_data}, to ${log_file}.
#
# Reads:   my_url, start_num, end_num, pages_to_get, useragent, bulk_data,
#          log_file, use_client, have_lynx, have_lynx_opts, have_links,
#          have_links_opts
# Returns: 0 after the last page, 1 (with a message on stderr) when no
#          client is selected or the page-number range is empty.
grasp_page () {
  local i random_val span
  local -a client_args=()

  if [ -z "${use_client}" ]; then
    printf 'grasp_page: no web client selected\n' >&2
    return 1
  fi

  span=$(( end_num - start_num + 1 ))
  if [ "${span}" -le 0 ]; then
    printf 'grasp_page: end_num (%s) is smaller than start_num (%s)\n' \
      "${end_num}" "${start_num}" >&2
    return 1
  fi

  # Select the client options once; they do not change between pages.
  if [ -n "${have_lynx_opts}" ] && [ "${use_client}" = "${have_lynx}" ]; then
    # Built as separate array elements so the user agent, which contains
    # spaces, reaches lynx as a single argument without literal quotes.
    client_args=("-useragent=${useragent}" -dump)
  elif [ -n "${have_links_opts}" ] && [ "${use_client}" = "${have_links}" ]; then
    # The option string is a plain list of flags and is meant to be
    # split into words here.
    # shellcheck disable=SC2206
    client_args=(${have_links_opts})
  fi

  # wget/fetch are not handled here: their (disabled) options save the page
  # to ${temp_file} instead of printing it, so they would need a separate
  # "download, grep '@' from ${temp_file}, remove ${temp_file}" path.

  for (( i = 1; i <= pages_to_get; i++ )); do
    # Three 15-bit RANDOM draws give 45 random bits, folded into the
    # requested range. Drawing inside the range directly means no
    # iteration is wasted on a number below start_num.
    random_val=$(( start_num + ( (RANDOM << 30) | (RANDOM << 15) | RANDOM ) % span ))

    # Client diagnostics are deliberately discarded; only page content
    # matters. bulk_data is matched as a fixed string, not as a regex.
    "${use_client}" "${client_args[@]}" "${my_url}${random_val}" 2>/dev/null \
      | grep '@' \
      | grep -vF -- "${bulk_data}" >> "${log_file}"
  done

  # A page without matches makes grep exit non-zero; that is not an error.
  return 0
}

# parse our log file.
#
# Cleans ${log_file} in place: removes "mailto" and double quotes, turns the
# characters [ ] ( ) = : * ^ $ # % ! into spaces, then sorts the lines and
# drops duplicates.
#
# Reads:   log_file
# Returns: 0 on success or when there is no log file to clean, non-zero
#          when the log cannot be read or rewritten.
parse_log_file () {
  local cleaned

  # Nothing has been collected, so there is nothing to clean up.
  [ -e "${log_file}" ] || return 0

  if [ ! -r "${log_file}" ]; then
    printf 'parse_log_file: cannot read %s\n' "${log_file}" >&2
    return 1
  fi

  # Unpredictable temporary name next to the log, so the final mv stays on
  # the same filesystem.
  cleaned=$(mktemp "${log_file}.XXXXXX") || return 1

  # sort must read the sed output from the pipe; handing it the log file as
  # an argument would make it ignore the cleaned-up stream.
  # Every listed character is replaced on each occurrence, not just the
  # first one per line.
  if sed -e 's/mailto//g' \
         -e 's/"//g' \
         -e 's/[][()=:*^$#%!]/ /g' < "${log_file}" \
       | sort -u > "${cleaned}"; then
    mv -- "${cleaned}" "${log_file}"
  else
    rm -f -- "${cleaned}"
    return 1
  fi
}

# main
# Entry point: runs the four stages of the script in order.
main () {
  # show the program banner
  banner
  # pick the page fetcher
  choose_client
  # fetch the pages and collect the matching lines
  grasp_page
  # clean up the collected log
  parse_log_file
}

main
