User:Ap/LaTeX conversion utility
Appearance
< User:Ap
#!/usr/bin/perl -w
##
## Copyright (C) 2003 Arno W. Peters.
## released under GNU GPL version 2 or higher.
use strict;
use DBI();
## Print the LaTeX preamble on the currently selected output handle:
## document class, packages, the \sup and \sub helper macros, an active
## "/" that allows a line break after a slash, and \begin{document}.
sub texheader {
    # Non-interpolating heredoc: every backslash below is emitted verbatim.
    print <<'END_PREAMBLE';
\documentclass[10pt,english,a4paper,twocolumn]{book}
\usepackage[latin1]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{geometry}
\usepackage[cm]{fullpage}
\usepackage{babel}

\def\sup#1{\ensuremath{^#1}}
\def\sub#1{\ensuremath{_#1}}
\def\slash{/}
\catcode`\/=\active
\def/{\slash\discretionary{}{}{}}

\begin{document}
END_PREAMBLE
}
## Print the line that closes the LaTeX document on the currently
## selected output handle.
sub texfooter {
    print '\end{document}' . "\n";
}
## Convert one wiki article to LaTeX and print it on the currently
## selected output handle.
##
##   $title - article title; underscores become spaces and the characters
##            # $ % & are escaped for LaTeX.
##   $text  - article body in wiki markup, optionally containing a small
##            set of HTML tags.
##
## The output is an unnumbered \section* for the title followed by the
## converted body wrapped in \begingroup ... \par\endgroup.  Nothing is
## returned.
##
## The order of the passes matters:
##   1. LaTeX specials are escaped first, so that braces and backslashes
##      introduced by later replacements are left alone.
##   2. HTML tags are converted before "<" and ">" are escaped; otherwise
##      no tag pattern could ever match.
##
## TODO: <math>...</math> sections are not treated specially yet.
sub article2tex {
    my ($title, $text) = @_;

    $title =~ s/_/ /g;
    $title =~ s/([#\$%&])/\\$1/g;

    # --- Pass 1: characters that are special to TeX ----------------------
    $text =~ s/\r//g;
    $text =~ s{\\}{/}g;              # "/" is made active by the preamble
    $text =~ s/\$/\\\$/g;
    $text =~ s/([{}])/\\$1/g;        # before the caret: \^{} adds braces
    $text =~ s/\^/\\^{}/g;

    # --- Pass 2: non-ASCII characters with a direct LaTeX spelling -------
    # The keys are distinct characters and every replacement is plain
    # ASCII, so all of them can be handled in one alternation.
    my %tex_for = (
        'ë' => q{\"e},
        'ö' => q{\"o},
        'ü' => q{\"u},
        'ï' => q{\"{\i}},
        'á' => q{\'a},
        'é' => q{\'e},
        'è' => q{\`e},
        'É' => q{\'E},

        # Greek capitals that are written with a Latin letter.
        'Α' => 'A', 'Β' => 'B', 'Ε' => 'E', 'Ζ' => 'Z', 'Η' => 'E',
        'Ι' => 'I', 'Κ' => 'K', 'Μ' => 'M', 'Ν' => 'N', 'Ρ' => 'R',
        'Τ' => 'T',                  # LaTeX defines no \Tau macro
        'Χ' => 'X',

        '²' => '\\sup{2}',
        '³' => '\\sup{3}',
        '°' => '\\ensuremath{^\\circ}',
        '—' => '---',

        # Non-breaking space, spelled as its UTF-8 bytes so that it is
        # visible here.  NOTE(review): this assumes the file is UTF-8 and
        # has no "use utf8", like the literal keys above -- confirm.
        "\xC2\xA0" => '~',
    );

    # Characters that map onto a math-mode macro of the given name.
    my %math_macro_for = (
        'α' => 'alpha',    'β' => 'beta',     'γ' => 'gamma',
        'Γ' => 'Gamma',    'δ' => 'delta',    'Δ' => 'Delta',
        'ε' => 'epsilon',  'ζ' => 'zeta',     'η' => 'eta',
        'θ' => 'theta',    'Θ' => 'Theta',    'ι' => 'iota',
        'κ' => 'kappa',    'λ' => 'lambda',   'Λ' => 'Lambda',
        'μ' => 'mu',       'ν' => 'nu',       'ξ' => 'xi',
        'Ξ' => 'Xi',       'π' => 'pi',       'Π' => 'Pi',
        'ρ' => 'rho',      'σ' => 'sigma',    'Σ' => 'Sigma',
        'τ' => 'tau',      'υ' => 'upsilon',  'Υ' => 'Upsilon',
        'φ' => 'phi',      'Φ' => 'Phi',      'χ' => 'chi',
        'ψ' => 'psi',      'Ψ' => 'Psi',      'ω' => 'omega',
        'Ω' => 'Omega',
        '≤' => 'leq',      '≥' => 'geq',
        '↑' => 'uparrow',  '↓' => 'downarrow',
        '→' => 'rightarrow', '←' => 'leftarrow',
        '∀' => 'forall',   '∈' => 'in',       '∪' => 'cup',
        '∫' => 'int',
        '♣' => 'clubsuit', '♠' => 'spadesuit',
        '♦' => 'diamondsuit', '♥' => 'heartsuit',
    );
    for my $char (keys %math_macro_for) {
        $tex_for{$char} = sprintf '\\ensuremath{\\%s}', $math_macro_for{$char};
    }

    my $symbol_re = join '|', map { quotemeta } sort keys %tex_for;
    $text =~ s/($symbol_re)/$tex_for{$1}/g;

    # --- Pass 3: wiki markup ----------------------------------------------
    # Only a redirect at the very start of the article is recognised.
    $text =~ s/^#redirect/See /i;

    # Remaining TeX specials.  "#" is escaped here, which is why the
    # numbered-list patterns below look for "\#".
    $text =~ s/([&#%_])/\\$1/g;

    # Quote-run emphasis, longest run first.
    $text =~ s/''''''(.+?)''''''/\{\}$1\{\}/g;
    $text =~ s/'''''(.+?)'''''/\\emph{\\textbf{$1}}/g;
    $text =~ s/''''(.+?)''''/\\textbf{'$1'}/g;
    $text =~ s/'''(.+?)'''/\\textbf{$1}/g;
    $text =~ s/''(.+?)''/\\emph{$1}/g;

    ## Ignore verbatim
    ## $text =~ s/^ (.*)$/\\begin{verbatim}$1\n\\end{verbatim}/mg;

    # Headings, deepest level first.
    $text =~ s/^=====(.*)=====/\\subparagraph*{$1}\n/mg;
    $text =~ s/^====(.*)====/\\paragraph*{$1}\n/mg;
    $text =~ s/^===(.*)===/\\subsubsection*{$1}\n/mg;
    $text =~ s/^==(.*)==/\\subsection*{$1}\n/mg;

    # A hyphen between two digits becomes an en dash.  Lookarounds keep
    # the digits out of the match so that "1-2-3" is converted throughout.
    $text =~ s/(?<=\d)-(?=\d)/--/g;

    # Bulleted, indented and numbered lines, deepest level first.
    $text =~ s/^\*\*\*/\\par\\noindent\\hangindent=6em\\hskip5em\\llap{\\ensuremath{\\bullet}}\\quad /mg;
    $text =~ s/^\*\*/\\par\\noindent\\hangindent=4em\\hskip3em\\llap{\\ensuremath{\\bullet}}\\quad /mg;
    $text =~ s/^\*/\\par\\noindent\\hangindent=2em\\quad\\llap{\\ensuremath{\\bullet}}\\quad /mg;
    $text =~ s/^:::/\\par\\noindent\\hangindent=6em\\hskip6em /mg;
    $text =~ s/^::/\\par\\noindent\\hangindent=4em\\hskip4em /mg;
    $text =~ s/^:/\\par\\noindent\\hangindent=2em\\qquad /mg;
    $text =~ s/^\\#\\#/\\par\\noindent\\hangindent=4em\\hskip3em\\llap{\\#}\\quad /mg;
    $text =~ s/^\\#/\\par\\noindent\\hangindent=2em\\quad\\llap{\\#}\\quad /mg;

    # --- Pass 4: HTML tags -------------------------------------------------
    # \b after the tag name keeps e.g. the <p ...> rule away from <pre>.
    my $list_item = '\\par\\noindent\\qquad ';
    my $rule      = '\\vskip.25\\baselineskip \\hbox to\\hsize{\\hfil'
                  . '\\vrule width5cm height1pt\\hfil}'
                  . '\\vskip.25\\baselineskip ';

    my @html_rules;

    # List items and definitions start an indented paragraph.
    push @html_rules, [ qr{<$_\b[^>]*>}i => $list_item ],
                      [ qr{</$_>}i       => ' ' ]
        for qw(li dd);

    # List containers and paragraphs only force a paragraph break.
    push @html_rules, [ qr{<$_\b[^>]*>}i => "\n\n" ],
                      [ qr{</$_>}i       => ' ' ]
        for qw(ul dl ol p);

    push @html_rules, (
        [ qr{<h1>}i          => '\\subsection*{' ],
        [ qr{<h2>}i          => '\\subsubsection*{' ],
        [ qr{<h3>}i          => '\\paragraph*{' ],
        [ qr{<h4>}i          => '\\subparagraph*{' ],
        # Only h1-h4 are opened above, so only those may emit a "}".
        [ qr{</h[1-4]>}i     => "}\n" ],

        [ qr{<blockquote>}i  => "\\begin{quotation}\n" ],
        [ qr{</blockquote>}i => "\\end{quotation}\n" ],

        [ qr{<strong>}i      => '\\textbf{' ],
        [ qr{<b>}i           => '\\textbf{' ],
        [ qr{<i>}i           => '\\emph{' ],
        [ qr{<var>}i         => '\\emph{' ],
        [ qr{<emph>}i        => '\\emph{' ],
        [ qr{<sup>}i         => '\\sup{' ],
        [ qr{<sub>}i         => '\\sub{' ],
        [ qr{<small>}i       => '{\\small ' ],
        [ qr{</(?:strong|b|i|var|emph|sup|sub|small)>}i => '}' ],

        [ qr{<br\b[^>]*>}i   => '\\\\[.5\\baselineskip]' ],
        [ qr{----+}          => $rule ],
        [ qr{<hr\b[^>]*>}i   => $rule ],

        [ qr{<center\b[^>]*>}i => "\\begin{center}\n" ],
        [ qr{</center>}i       => "\\end{center}\n" ],
    );

    ## Ignore tables for now: table markup and <div> collapse to a space
    ## (no brace is emitted, since nothing was opened), <font> vanishes.
    push @html_rules, [ qr{<$_\b[^>]*>}i => ' ' ],
                      [ qr{</$_>}i       => ' ' ]
        for qw(table tr td th div);
    push @html_rules, [ qr{<font\b[^>]*>}i => '' ],
                      [ qr{</font>}i       => '' ];

    for my $html_rule (@html_rules) {
        my ($pattern, $replacement) = @{$html_rule};
        $text =~ s/$pattern/$replacement/g;
    }

    # --- Pass 5: leftovers -------------------------------------------------
    # Angle brackets that were not part of a recognised tag.
    $text =~ s/([<>])/\\ensuremath{$1}/g;

    # Typographic apostrophe; done after the quote-run emphasis above so
    # that it cannot be mistaken for wiki markup.
    $text =~ s/’/'/g;

    # En dash; done after the horizontal-rule pass so that the hyphens it
    # introduces cannot be counted towards a "----" rule.
    $text =~ s/–/--/g;

    print "\\section*{$title}\n\n";
    print "\\begingroup\n";
    print $text;
    print "\\par\\endgroup\n";
    print "\n\n";
}
# Main program: print a LaTeX document containing every main-namespace
# article whose title starts with the prefix given on the command line.

# Check the command line first, so that a missing argument is reported
# without opening a database connection.
die "No argument specified" unless @ARGV;
my $letter = shift @ARGV;

# Connect to the database.  RaiseError makes every DBI failure fatal, so
# the calls below need no individual error checks.
my $dbh = DBI->connect("DBI:mysql:database=wikipedia-en;host=localhost",
                       "arno", "",
                       {'RaiseError' => 1});

print STDERR "Generating all articles starting with $letter.\n";

# Now retrieve data from the table.  The prefix is passed as a bound
# value rather than interpolated into the SQL, so quotes in the argument
# cannot break or alter the statement.  LIKE wildcards inside the
# argument still act as wildcards, as they did before.
my $sth = $dbh->prepare("SELECT cur_title, cur_text FROM cur " .
                        "WHERE cur_title LIKE ? AND cur_namespace = 0");
$sth->execute("$letter%");

texheader();
while (my $ref = $sth->fetchrow_hashref) {
    article2tex($ref->{'cur_title'}, $ref->{'cur_text'});
}
$sth->finish;
texfooter();

# Disconnect from the database.
$dbh->disconnect;