Skip to content

Commit

Permalink
Respect files without styles (#9)
Browse files Browse the repository at this point in the history
* Respect files without styles

fix

* fix
  • Loading branch information
velios authored Nov 6, 2023
1 parent 11342a8 commit 5137a65
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 64 deletions.
10 changes: 8 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
# Changelog

0.0.8 / 2023-11-03
------------------
- Changed
- Fix parsing of xlsx files that have no xl/styles.xml
- Refactoring and optimizations

0.0.7 / 2023-10-27
------------------
- Changed
- Fix parse xlsx without xl/sharedStrings.xml
- Fix parse cell with number and inlineStr data types
- Fix parse xlsx without xl/sharedStrings.xml
- Fix parse cell with number and inlineStr data types

0.0.6 / 2023-09-16
------------------
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ In the first example below we pull data from a specific sheet within the workboo
#!/usr/bin/env bb
(require '[babashka.deps :as deps])
(deps/add-deps
'{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.6"}}})
'{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.8"}}})

(ns demo
(:require [clojure.java.io :as io]
Expand Down
4 changes: 2 additions & 2 deletions bb-excel
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

(require '[babashka.deps :as deps])

(deps/add-deps '{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.6"}}})
(deps/add-deps '{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.8"}}})

(ns bb-excel
(:require [bb-excel.core :refer [get-sheets get-range get-sheet]]
Expand Down Expand Up @@ -115,7 +115,7 @@
(defn help
"Command line options"
[summary]
(->> ["bb-excel 0.0.6"
(->> ["bb-excel 0.0.8"
""
"Usage: bb-excel input-file options"
""
Expand Down
4 changes: 2 additions & 2 deletions bbexcel
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

(require '[babashka.deps :as deps])

(deps/add-deps '{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.6"}}})
(deps/add-deps '{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.8"}}})

(ns bbexcel
(:require [bb-excel.core :refer [get-sheets get-range get-sheet]]
Expand Down Expand Up @@ -115,7 +115,7 @@
(defn help
"Command line options"
[summary]
(->> ["bbexcel 0.0.6"
(->> ["bbexcel 0.0.8"
""
"Usage: bbexcel input-file options"
""
Expand Down
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject com.github.kbosompem/bb-excel "0.0.7"
(defproject com.github.kbosompem/bb-excel "0.0.8"
:description "A Simple Clojure/Babashka Library for Reading Data from Excel Files"
:url "https://github.com/kbosompem/bb-excel"
:license {:name "EPL-2.0"
Expand Down
123 changes: 67 additions & 56 deletions src/bb_excel/core.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns bb-excel.core
(:require [clojure.data.xml :refer [parse-str]]
(:require [clojure.string :as str]
[clojure.data.xml :refer [parse-str]]
[clojure.java.io :as io]
[clojure.set :refer [rename-keys]])
(:import [java.io File]
Expand Down Expand Up @@ -67,6 +68,11 @@
:xmlns.http%3A%2F%2Fschemas.openxmlformats.org%2FofficeDocument%2F2006%2Frelationships/id
:xmlns.http%3A%2F%2Fpurl.oclc.org%2Fooxml%2FofficeDocument%2Frelationships/id}})

;; Entries of `tags` bound to their own names so that call sites can test
;; an element's :tag with `contains?` without repeating the map lookup.
(def ^:const SHEET_TAG_TAGS (get tags :sheet-tag))   ; used to find the sheets element of the workbook
(def ^:const TEXT_PART_TAGS (get tags :text-part))   ; used to find entries of the shared-strings file
(def ^:const TEXT_T_TAGS (get tags :text-t))         ; used to find text elements inside a cell/entry
(def ^:const SHEET_DATA_TAGS (get tags :sheet-data)) ; used to find the sheet-data element of a worksheet

(defn- zipfile-or-nil
"Retrieve ZipFile object if provided `file-or-filename` point to existing file or nil"
[file-or-filename]
Expand All @@ -85,7 +91,7 @@
(let [wb (.getEntry zipfile "xl/workbook.xml")
ins (.getInputStream zipfile wb)
x (parse-str (slurp ins))
y (filter #((:sheet-tag tags) (:tag %)) (xml-seq x))]
y (filter #(contains? SHEET_TAG_TAGS (:tag %)) (xml-seq x))]
(->> y
first
:content
Expand Down Expand Up @@ -132,68 +138,73 @@

(defn process-cell
  "Process a single Excel cell.

  Arguments:
    shared-strings - indexed collection of the workbook's shared strings
    styles         - style lookup handed to `style-check`
    cell           - map of the cell's attributes (`:r` reference, optional
                     `:t` type, ...) with the raw value under `:d`

  Returns `cell` with `:x` (column letters) and `:y` (row number, still a
  string) taken from the `:r` reference, and with `:d` converted according
  to the cell type in `:t`. Cells without a recognised type are converted
  according to their style (percent, date, time) or returned unchanged."
  [shared-strings styles cell]
  (let [;; "AB12" -> column letters "AB", row number "12"
        [_ col-letters row-number] (re-matches #"([A-Z]+)([0-9]+)" (:r cell))
        cell* (assoc cell :x col-letters :y row-number)
        cell-type (:t cell*)
        cell-value (:d cell*)]
    ;; Possible cell types are well explained here https://stackoverflow.com/a/18346273
    (case cell-type
      ;; Shared string: the value is an index. `get` yields nil instead of
      ;; throwing when the index is missing, malformed or out of range
      ;; (e.g. the workbook has no xl/sharedStrings.xml).
      "s" (assoc cell* :d (some->> cell-value parse-long (get shared-strings)))
      ("str" "inlineStr") cell*
      "b" (assoc cell* :d (= "1" cell-value))
      "e" (assoc cell* :d (get error-codes cell-value))
      ;; Numbers may be integral or decimal; `parse-long` alone would turn
      ;; every decimal value such as "3.14" into nil.
      "n" (assoc cell* :d (when (string? cell-value)
                            (or (parse-long cell-value)
                                (parse-double cell-value))))
      ;; No (known) explicit type: fall back to the cell style.
      (cond
        (style-check cell* styles pcts) (assoc cell* :d (num2pct cell-value))
        (style-check cell* styles dates) (assoc cell* :d (num2date cell-value))
        (style-check cell* styles times) (assoc cell* :d (num2time cell-value))
        :else cell*))))

(defn- get-row-index
  "Return the row number stored as a string under `:y` of a processed cell
  as a long. Returns nil when `row` is nil, has no `:y`, or `:y` is not a
  valid integer, instead of throwing (`parse-long` rejects nil input)."
  [row]
  (some-> row :y parse-long))

(defn process-row
  "Process one Excel row of data.

  Arguments:
    shared-strings - indexed collection of the workbook's shared strings
    styles         - style lookup, passed through to `process-cell`
    row            - sequence of parsed cell elements belonging to one row

  Returns a map with the row number under `:_r` and one entry per cell,
  keyed by the cell's column letters as a keyword and holding the processed
  value. A row without cells yields an empty map."
  [shared-strings styles row]
  (let [cells (into []
                    (comp
                     ;; cell attributes plus the raw value: the last child
                     ;; of the cell's last child element
                     (map (fn [cell]
                            (assoc (:attrs cell)
                                   :d (-> cell :content last :content last))))
                     (map (partial process-cell shared-strings styles)))
                    row)]
    ;; The row number is read from the first cell; guard against rows that
    ;; contain no cells so they do not reach `get-row-index` with nil.
    (if-let [first-cell (first cells)]
      (into {:_r (get-row-index first-cell)}
            (map (juxt (comp keyword :x) :d))
            cells)
      {})))

(defn- get-cell-text
  "Return the text of `cell` as one string: the contents of every element
  in the tree under `cell` whose tag is in TEXT_T_TAGS, concatenated in
  document order."
  [cell]
  (str/join
   (sequence (comp (filter (fn [node] (contains? TEXT_T_TAGS (:tag node))))
                   (mapcat :content))
             (xml-seq cell))))

(defn get-shared-strings
  "Get all shared strings of the Excel spreadsheet.

  Reads xl/sharedStrings.xml from `zipfile` and returns a vector with the
  text of each entry, in file order, so a string can be looked up by its
  position. Returns an empty vector when the workbook has no
  xl/sharedStrings.xml."
  [^ZipFile zipfile]
  (if-let [entry (.getEntry zipfile "xl/sharedStrings.xml")]
    (let [;; read the entry fully, then close its stream right away
          xml-text (with-open [ins (.getInputStream zipfile entry)]
                     (slurp ins))]
      (into []
            (comp (filter #(contains? TEXT_PART_TAGS (:tag %)))
                  (map get-cell-text))
            (xml-seq (parse-str xml-text))))
    []))

(defn get-styles
  "Get the cell styles of the Excel spreadsheet.

  Reads xl/styles.xml from `zipfile`, takes the first element matching the
  `:cellxfs` tags and returns a vector with the `:numFmtId` attribute of
  each of its `:xf` children, in file order. Returns an empty vector when
  the workbook has no xl/styles.xml."
  [^ZipFile zipfile]
  (if-let [entry (.getEntry zipfile "xl/styles.xml")]
    (let [;; read the entry fully, then close its stream right away
          xml-text (with-open [ins (.getInputStream zipfile entry)]
                     (slurp ins))]
      (->> (xml-seq (parse-str xml-text))
           (filter #((:cellxfs tags) (:tag %)))
           first
           :content
           (filter #((:xf tags) (:tag %)))
           (mapv (comp :numFmtId :attrs))))
    []))

(defn get-sheet
"Get sheet from file or filename"
Expand Down Expand Up @@ -222,16 +233,16 @@
(throw (ex-info message {}))))
wb (.getEntry zipfile (str "xl/worksheets/sheet" sheetid ".xml"))
ins (.getInputStream zipfile wb)
dict (get-unique-strings zipfile)
shared-strings (get-shared-strings zipfile)
styles (get-styles zipfile)
xx (slurp ins)
x (parse-str xx)
d (->> x :content
(filter #((:sheet-data tags) (:tag %)))
d (->> (:content x)
(filter #(contains? SHEET_DATA_TAGS (:tag %)))
first :content
(map :content)
(take rows)
(map (partial process-row dict styles)))
(map (partial process-row shared-strings styles)));
dx (remove #(= row (:_r %)) d)
h (when hdr (merge (update-vals (first (filter #(= (:_r %) row) d)) fxn) {:_r :_r}))
dy (if (pos? rows)
Expand Down Expand Up @@ -274,7 +285,7 @@
(defn parse-range
"Takes in an Excel coordinate and returns a hashmap of rows and columns to pull"
[s]
(let [[[_ osc osr oec oer]] (re-seq #"([A-Z]+)([0-9]*)[:]?([A-Z]*)([0-9]*)" s)
(let [[_ osc osr oec oer] (re-matches #"([A-Z]+)([0-9]*)[:]?([A-Z]*)([0-9]*)" s)
sc (or osc "A")
ec (or (when-str oec) (when-str osc) sc)
sr (or (when-num osr) 1)
Expand Down Expand Up @@ -315,7 +326,7 @@
"Get range of values returned as list of rows"
[sheet rows cols]
(map #(select-keys % cols)
(filter #((set rows) (:_r %)) sheet)))
(filter #(contains? (set rows) (:_r %)) sheet)))

(defn get-range
"Get range of values using Excel cell coordinates
Expand Down

0 comments on commit 5137a65

Please sign in to comment.