#!/bin/bash
# Format BibTeX files in a JabRef-like style.

# Print the help text to stderr and terminate the script.
#
# Arguments: $1 - exit status to terminate with (0 when omitted).
# Outputs:   the usage text on stderr; nothing on stdout.
function usage() {
    # Resolve the program name once; quoting keeps paths with blanks intact.
    local prog
    prog="$(basename -- "$0")"

    cat 1>&2 <<EOF
${prog}: Format bibtex files in JabRef like style

USAGE: ${prog} bibfiles
Options:
  -h  print this help
  -n  show changes only
  -u  do not change field contents
  -p  print non ascii characters
EOF
    exit "${1:-0}"
}
# Without any argument there is nothing to do: print the help and fail.
if [ $# -lt 1 ]; then usage 1; fi

# Default options.
INPLACE=1    # Update source file.
EDITVALUES=1 # Edit selected field values (empty string disables this).
NONASCII=0   # Do not print non ascii characters.

# Parse command line options. Every argument is inspected, so options may be
# mixed freely with file names; anything that is not a known option is left
# for the file loop further down.
for arg; do
    case "$arg" in
        -h) usage 0 ;;       # Print usage and exit.
        -n) INPLACE=0 ;;     # Only show the diff.
        -u) EDITVALUES="" ;; # Keep field values unchanged.
        -p) NONASCII=1 ;;    # Print non ascii characters.
    esac
done

# Function to replace double quotes by curly brackets.
#
# Arguments: $1 - bib file, edited in place.
#
# Only fields that start as `  name = "` are touched. The closing quote is
# the last quote of a line that leaves an odd number of quotes behind the
# opening one; further lines are appended until such a line is found, so
# values spanning several lines are handled as well. Requires GNU sed
# (-i, -r and the T command).
function replace_quotes() { # replace_quotes bibfile
    local -a program
    program=(
        # Replace opening quote by {.
        -e 's/^  ([A-Za-z][A-Za-z_-]*) = "/  \1 = {/'
        # End here if there was no replacement.
        -e 'T'
        -e ':a'
        # Keep a copy of the line (hold space).
        -e 'h'
        # Remove all quote pairs to get the quote count modulo 2.
        -e 's/"[^"]*"//g'
        # Jump to the directly following label; this only clears the
        # "substitution happened" flag set by the previous command.
        -e 'tb'
        -e ':b'
        # Succeeds only if an unpaired quote is left (odd number of quotes).
        -e 's/"//'
        # Restore the line from the copy (hold space).
        -e 'g'
        # Even number of quotes: skip the replacement of the closing quote.
        -e 'Tc'
        # Replace closing quote by }.
        -e 's/" *(,?)$/}\1/'
        -e ':c'
        # End here if the closing quote was replaced.
        -e 't'
        # Otherwise append the next line and search again.
        -e 'N'
        -e 'ba'
    )
    sed -i -r "${program[@]}" "$1"
}

# Function to insert brackets around non quoted field contents.
#
# Arguments: $1 - bib file, edited in place (GNU sed).
#
# A field line `  name = value` whose value does not start with `{` gets its
# value wrapped in curly brackets; trailing commas stay outside. The first
# alternative covers values of a single character (e.g. `Volume = 5,`),
# which the two-character pattern alone would skip. A lone comma or closing
# bracket is not treated as a value, so empty fields stay as they are.
function insert_brackets() { # insert_brackets bibfile
    sed -i -r '
s/^  ([A-Za-z][A-Za-z_-]*) = ([^{},]|[^{].*[^},])(,*)$/  \1 = {\2}\3/
' "$1"
}

# Function to merge lines.
#
# Arguments: $1 - bib file, edited in place (GNU sed).
#
# Joins the physical lines of a field value `  name = {...}` into one line.
# A single pass can leave one line break behind (for example in front of a
# closing bracket that sits on a line of its own), which is why the main
# loop calls this function twice in a row.
function merge_lines() { # merge_lines bibfilename
    local -a program
    program=(
        -e ':a'
        # Blank to match opening bracket at line end in the next row.
        -e '/{$/s//& /'
        # Append line if the field is incomplete.
        -e '/^  [A-Za-z][A-Za-z_-]* = {.*\([^},]\|[^}] *,$\)$/N'
        # At left curly bracket remove newline and whitespace.
        -e 's/{[\n \t]*/{/'
        # At right curly bracket remove whitespace and newline.
        -e 's/[ \t]*\n[ \t]*}/}/'
        # At other places replace space+newline by single space.
        -e 's/[ \t]*\n[ \t]*/ /g'
        # If bib field is still incomplete, append next line.
        -e '/[^},]$/N'
        # Same test again (empty regex reuses the previous one): go to
        # label a to repeat this for multiple-line fields.
        -e '//ba'
        # Remove whitespace before trailing curly bracket.
        -e '/ *\(},*\)$/s//\1/'
        # Remove newline between trailing curly bracket and comma.
        -e '/}$/{N;s/\n *,/,/}'
    )
    sed -i "${program[@]}" "$1"
}


# Iterate bib files.
for bibfile in "$@"
do
    # Skip options (they were already evaluated before the loop).
    case "$bibfile" in
        -n | -u | -p) continue ;;
    esac

    # Select pair of pdf and bib file from either filename: the argument may
    # be given with or without the ".bib" suffix.
    option="$bibfile" # Argument as typed, for the error message.
    pdffile="${bibfile%.bib}"
    bibfile="${pdffile}.bib"

    # Skip if not a regular file. An explicit if/else makes sure exactly one
    # of the two messages is printed.
    if [[ ! -f "$bibfile" ]]; then
        if [[ "$bibfile" =~ ^- ]]; then
            echo "$(basename -- "$0"): $option: Unknown option" >&2
        else
            echo "$(basename -- "$0"): $bibfile: File not found" >&2
        fi
        continue
    fi

    # Only print non ascii characters with option '-p'.
    # Lines mentioning "Abstract"/"abstract" are left out of the report; the
    # last grep only highlights the offending bytes. The file itself is not
    # modified in this mode.
    if [[ $NONASCII -eq 1 ]]; then
        # The patterns are quoted so the shell cannot expand them as globs.
        grep -P -n -H '[^\x00-\x7F]' -- "$bibfile" \
            | grep -v '[Aa]bstract' \
            | grep --color='auto' -P -n '[^\x00-\x7F]'
        continue
    fi

    # Create a temporary copy; all edits below work on the copy only.
    tmpfile="$(mktemp)" || {
        echo "$(basename -- "$0"): cannot create temporary file" >&2
        continue
    }
    cp -a -- "$bibfile" "$tmpfile" || {
        rm -f -- "$tmpfile"
        continue
    }

    # Basic clean-up in a single pass; every step works on one line at a
    # time, so the order per line is the same as with separate passes:
    #   1. remove utf8 BOM (first line only),
    #   2. remove CR line feeds,
    #   3. remove trailing whitespace,
    #   4. merge runs of blanks into a single blank.
    sed -i \
        -e $'1s/^\uFEFF//' \
        -e 's/\r//' \
        -e 's/[\t ][\t ]*$//' \
        -e 's/  */ /g' \
        "$tmpfile"

    # Indent field lines with two spaces and put single blanks around "=",
    # then title case entry types ("@ARTICLE {" -> "@Article{") and field
    # names ("  TITLE = " -> "  Title = "). All three steps are per line.
    sed -i -r \
        -e 's/^[ \t]*([A-Za-z][^={"]*[^ ={"])[ \t]*=[ \t]*/  \1 = /' \
        -e 's/^@([A-Za-z][A-Za-z_-]*) *\{/@\L\u\1{/' \
        -e 's/^  ([A-Za-z][A-Za-z_-]*) = /  \L\u\1 = /' \
        "$tmpfile"

    # Use curly brackets {} instead of double quotes "" or no quotes.
    sed -i 's/"} *$/"\n}/' "$tmpfile" # Closing brace on separate line.
    replace_quotes "$tmpfile"
    insert_brackets "$tmpfile"

    # Remove preceding and trailing whitespace in field contents.
    sed -i 's/  *\(},\?\)$/\1/' "$tmpfile"
    sed -i -r '/\{ /s/^(  [A-Za-z][A-Za-z_-]* = \{) +/\1/' "$tmpfile"

    # Remove double curly brackets around fields (but not "{{...} ... {...}}").
    # The inner address only accepts values whose outer pair encloses plain
    # text and complete one-level {...} groups, so the outer pair is really
    # redundant before it is stripped.
    sed -i -r '/\{\{/{
      /^  [A-Za-z][A-Za-z_-]* = *\{\{([^{}]*)(\{[^{}]*\}[^{}]*)*\}\}[^}]*$/{
        s/^(  [A-Za-z][A-Za-z_-]* = *\{)\{(.*)\}(\}[^}]*)$/\1\2\3/
      }
    }' "$tmpfile"

    # Append single blank line.
    # ($G appends a newline plus the hold space to the last line; the hold
    # space is never filled by this script, so the result is one empty line.)
    sed -i '$G' "$tmpfile"

    # Merge blank lines.
    # Starting at an empty line, the next line is appended (N) and dropped
    # again as long as it is empty too, so a run of blank lines shrinks to a
    # single one.
    sed -i '/^$/{:b;N;s/\n$//;tb}' "$tmpfile"

    # Remove preceding blank line.
    # (Joins line 1 with line 2 and strips the line break if line 1 is empty.)
    sed -i '1{N;s/^\n//}' "$tmpfile"

    # Merge lines.
    # NOTE(review): the second call looks intentional - presumably one pass of
    # merge_lines does not always join every line of a field; confirm before
    # removing it.
    merge_lines "$tmpfile"
    merge_lines "$tmpfile"

    if [ "$EDITVALUES" ]
    then

165
        # Replace some utf8 signs (do not change abstract):
        #   - typographic quotes and apostrophes become ASCII ones,
        #   - special blanks and soft hyphens are folded into plain blanks,
        #   - hyphen, minus and en dash become "-", the em dash "---",
        #   - the "fi" and "fl" ligatures are spelled out.
        # NOTE(review): the blank characters in the bracket expression and in
        # the y/// source list are visually indistinguishable; presumably they
        # were distinct Unicode spaces (no-break, thin, ...) - verify against
        # the upstream file. The y/// lists must keep the same length.
        sed -i "/^  [Aa]bstract/!{
                y/׳’“”/''\"\"/
                s/[   ­   ] / /g
                y/‐−–   ­   /---       /
                s/—/---/
                s/ﬁ/fi/
                s/ﬂ/fl/
            }" "$tmpfile"

        # Replace utf8 characters by latex (except in abstract and url fields).
        # bib-convl is an external helper that is not part of this file; the
        # -f argument presumably is a sed address restricting the lines it
        # edits - confirm against its documentation.
        bib-convl -i "$tmpfile" -f '/^  [Aa]bstract\|^  [Uu]rl/!'

        # Escape percent signs (TeX comment, also escaped in abstract!).
        # The substitution is repeated until nothing is left to escape, so
        # directly adjacent signs ("%%") are all handled; a single s///g
        # would miss every second one because matches cannot overlap.
        sed -i '/^  [Uu]rl/!{
            :pct
            s/\([^\\]\)%/\1\\%/
            tpct
        }' "$tmpfile"

        # Escape ampersands (not in abstract and url fields), same technique.
        sed -i '/^  [Aa]bstract\|^  [Uu]rl/!{
            :amp
            s/\([^\\]\)&/\1\\\&/
            tamp
        }' "$tmpfile"

        # Clean up author and editor lists in a single pass. The commands run
        # in the listed order on every Author/Editor line:
        #   1. remove footnote signs,
        #   2. avoid duplicate "and" and "and" after a comma, and insert the
        #      missing blank in front of "and" after a dot or comma,
        #   3. single spacing for abbreviated forenames (two times to cover
        #      "A.B.C."),
        #   4. always put dots after capital letters used as forename
        #      abbreviation,
        #   5. single spacing after commas.
        # The trailing blank created by step 5 is stripped afterwards; that
        # last substitution applies to every line of the file. Step 5 and the
        # stripping are run twice, as before.
        sed -i '
/^  \(Author\|Editor\) = {/{
    s/[*‡]//g
    s/\<and and\>/and/g
    s/,  *and\>/ and/g
    s/\([.,]\)and\>/\1 and/g
    s/\([A-Z]\.\)\([A-Z]\.[ }]\)/\1 \2/g
    s/\([A-Z]\.\)\([A-Z]\.[ }]\)/\1 \2/g
    s/\(\<[A-Z]\>\)\(}*[, ]\)/\1.\2/g
    s/\(\<[A-Z]\>\)\(} *,\) *$/\1.\2/g
    s/, */, /g
}
s/ $//
/^  \(Author\|Editor\) = {/s/, */, /g
s/ $//
' "$tmpfile"

        # Double dash and no whitespace in page ranges and number ranges.
        # Per field there are two forms: plain numbers ("12 - 34") and numbers
        # with a capital letter prefix ("A12 - A34"). Only the first range of
        # a line is rewritten.
        sed -i \
            -e '/^  Pages = {.*[ 0-9]--*[ 0-9]/s/\([0-9]\) *--* *\([0-9]\)/\1--\2/' \
            -e '/^  Pages = {.*[A-Z][ 0-9]*--*[ A-Z]*[0-9]/s/\([0-9]\) *--* *\([A-Z][0-9]\)/\1--\2/' \
            -e '/^  Number = {.*[ 0-9]--*[ 0-9]/s/\([0-9]\) *--* *\([0-9]\)/\1--\2/' \
            -e '/^  Number = {.*[A-Z][ 0-9]*--*[ A-Z]*[0-9]/s/\([0-9]\) *--* *\([A-Z][0-9]\)/\1--\2/' \
            "$tmpfile"

        # Normalize journal names (permits journal abbreviation):
        #   - remove the prefix "The ",
        #   - remove white space around the name,
        #   - replace " - " by a double dash without spaces.
        sed -i '
/^  Journal = {/{
    /^  Journal = {The /s/{The /{/
    s/ = {  */ = {/
    s/  *},/},/
    s/ - /--/g
}
' "$tmpfile"

        # Remove url and eprint if doi exists.
        # Otherwise, try to obtain doi from url.
        #
        # The file is read record-wise with "@" as separator, so each entry
        # is one record and each of its lines one field. The result is
        # written to a second temporary file and moved over the working copy
        # only if awk succeeded; a failed run leaves the working copy as is.
        awktmp="$(mktemp)"
        if awk 'BEGIN {
            RS = "@"
            FS = "\n"
            ORS = RS
            OFS = FS
        }
        {
            # Check if Doi is present (a braced value containing a dot).
            if ($0 ~ "\n  Doi = {.*\\..*}") {
                hasdoi = 1
            } else {
                hasdoi = 0
            }
            # Print record separator.
            if (NR > 1) { printf("%s", ORS) }
            # Iterate fields.
            for (i = 1; i <= NF; i++) {
                if (!hasdoi && ($i ~ "^  Url = .*doi.org")) {
                    # Use Url as Doi field.
                    sub("^  Url", "  Doi", $i)
                }
                if (hasdoi && ($i ~ "^  (Eprint|Url) = .*}[, ]*$")) {
                    # Remove Eprint and Url if Doi is present.
                } else {
                    # Print delimiter as required and field.
                    if (i > 1) { printf("%s", OFS) }
                    printf("%s", $i)
                }
            }
        }' < "$tmpfile" > "$awktmp"; then
            mv -- "$awktmp" "$tmpfile"
        else
            rm -f -- "$awktmp"
        fi

        # Remove the URL part ("https://dx.doi.org/" and the like) or a
        # 'doi:' / 'DOI ' prefix from doi fields; only the first occurrence
        # on a line is removed.
        sed -i \
            -e '/^  Doi = .*doi.org/s_[htps:/dx.]*doi.org/__' \
            -e '/^  Doi = .*doi:/s_doi:__' \
            -e '/^  Doi = .*DOI /s_DOI __' \
            "$tmpfile"

    fi

261
    # Remove empty entries (fields of the form "  Name = ,"), then apply the
    # exceptions from title case keys (at last, title case is expected above).
    sed -i \
        -e '/^  [A-Za-z][A-Za-z_-]* = ,$/d' \
        -e 's/^  Issn = /  ISSN = /' \
        "$tmpfile"

    # Format whitespace like JabRef (at last, single spacing is expected above).
    # The text in front of the first " = " is padded to 26 columns; the rest
    # of the line is put back together unchanged. Lines without " = " pass
    # through untouched.
    awktmp="$(mktemp)"
    awk -F' = ' '/ = /{
            printf("%-26s", $1)
            for (i=2; i<=NF; i++) {
                printf("%s%s", FS, $i)
            }
            printf("\n")
        }
        !/ = /{print $0}' "$tmpfile" > "$awktmp"
    mv "$awktmp" "$tmpfile"

    # In case of changes update source file or print diff.
    if ! diff -q -- "$bibfile" "$tmpfile" >/dev/null; then
        if [[ "$INPLACE" -eq 1 ]]; then
            # Copy the contents instead of moving the file: by now the
            # working copy is a file created by mktemp, and moving it over
            # the source would replace the source's permissions and links
            # with those of the temporary file.
            cp -- "$tmpfile" "$bibfile"
        else
            echo "=== changes for »$bibfile« ==="
            diff -- "$bibfile" "$tmpfile"
        fi
    fi

    # Remove temporary copy.
    rm -f -- "$tmpfile"
done