diff --git a/ent/parsed/flatten.go b/ent/parsed/flatten.go new file mode 100644 index 0000000..0d45def --- /dev/null +++ b/ent/parsed/flatten.go @@ -0,0 +1,345 @@ +package parsed + +import "strings" + +// ParsedFlat is the result of a scientific name-string parsing flattened +// for the convenience. +type ParsedFlat struct { + // Parsed is false if parsing did not succeed. + Parsed bool `json:"parsed"` + + // NomCode modifies parsing rules according to provided nomenclatural code. + NomCode string `json:"nomenclaturalCode,omitempty"` + + // ParseQuality is a number that represents the quality of the + // parsing. + // + // 0 - name-string is not parseable + // 1 - no parsing problems encountered + // 2 - small parsing problems + // 3 - serious parsing problems + // 4 - severe problems, name could not be parsed completely + // + // The ParseQuality is equal to the quality of the most + // severe warning (see qualityWarnings). If no problems + // are encountered, and the parsing succeeded, the parseQuality + // is set to 1. If parsing failed, the parseQuality is 0. + ParseQuality int `json:"quality"` + + // Verbatim is input name-string without modifications. + Verbatim string `json:"verbatim"` + + // Normalized is a normalized version of the input name-string. + Normalized string `json:"normalized,omitempty"` + + // CanonicalSimple is a simplified version of a name where some elements like ranks, + // or hybrid signs "×" are omitted (hybrid signs are present for hybrid + // formulas). + // + // It is most useful to match names in general. + CanonicalSimple string `json:"canonicalSimple,omitempty"` + + // CanonicalFull is a canonical form that keeps hybrid signs "×" for named + // hybrids and shows infra-specific ranks. + // + // It is most useful for detection of the best matches from + // multiple results. It is also recommended for displaying + // canonical forms of botanical names. + CanonicalFull string `json:"canonicalFull,omitempty"` + + // CanonicalStemmed is the most "normalized" and simplified version of the name. + // Species epithets are stripped of suffixes, "j" character converted to "i", + // "v" character converted to "u" according to "Schinke R, Greengrass M, + // Robertson AM and Willett P (1996)" + // + // It is most useful to match names when a variability in suffixes is + // possible. + CanonicalStemmed string `json:"canonicalStemmed,omitempty"` + + // Cardinality allows to sort, partition names according to number of + // elements in their canonical forms. + // + // 0 - cardinality cannot be calculated + // 1 - uninomial + // 2 - binomial + // 3 - trinomial + // 4 - quadrinomial + Cardinality int `json:"cardinality"` + + // Rank provides information about the rank of the name. It is not + // always possible to infer rank correctly, so this field will be + // omitted when the data for it does not exist. + Rank string `json:"rank,omitempty"` + + // Authorship is the verbatim authorship of the name. + Authorship string `json:"authorship,omitempty"` + + // Bacteria is not nil if the input name has a genus + // that is registered as bacterial. Possible + // values are "maybe" - if the genus has homonyms in other groups + // and "yes" if GNparser dictionary does not detect any homonyms + // + // The bacterial names often contain strain information which are + // not parseable and are placed into the "tail" field. + Bacteria string `json:"bacteria,omitempty"` + + // Candidatus indicates that the parsed string is a candidatus bacterial name. + Candidatus bool `json:"candidatus,omitempty"` + + // Virus is set to true in case if name is not parsed, and probably + // belongs to a wide variety of sub-cellular entities like + // + // - viruses + // - plasmids + // - prions + // - RNA + // - DNA + // + // Viruses are the vast majority in this group of names, + // as a result they gave (very imprecise) name to + // the field. + // + // We do plan to create a parser for viruses at some point, + // which will expand this group into more precise categories. + Virus bool `json:"virus,omitempty"` + + // Cultivar is true if a name was parsed as a cultivar. + Cultivar bool `json:"cultivar,omitempty"` + + // DaggerChar if true if a name-string includes '†' rune. + // This rune might mean a fossil, or be indication of the clade extinction. + DaggerChar bool `json:"daggerChar,omitempty"` + + // Hybrid is a string representation of a hybrid type. + // + // - a non-categorized hybrid + // - named hybrid + // - notho- hybrid + // - hybrid formula + Hybrid string `json:"hybrid,omitempty"` + + // GraftChimera is a string representation of graft chimera. + // + // - a non-categorized graft chimera + // - named graft chimera + // - graft chimera formula + GraftChimera string `json:"graftchimera,omitempty"` + + // Surrogate is a string repsresentation of a surrogate type. + + // - a non-categorized surrogates + // - surrogate names from BOLD project + // - comparisons (Homo cf. sapiens) + // - approximations (names for specimen that not fully identified) + Surrogate string `json:"surrogate,omitempty"` + + // Tail is an unparseable tail of a name. It might contain "junk", + // annotations, malformed parts of a scientific name, taxonomic concept + // indications, bacterial strains etc. If there is an unparseable tail, the + // quality of the name-parsing is set to the worst category. + Tail string `json:"tail,omitempty"` + + // Uninomial represents the single name used for uninomial nomenclature, + // typically applied to higher taxonomic ranks (e.g., family or order names + // like "Asteraceae"). This field is populated only for uninomial names and + // omitted otherwise. + Uninomial string `json:"uninomial,omitempty"` + + // Genus specifies the genus part of a binomial or trinomial scientific name + // (e.g., "Quercus" in "Quercus robur"). This field is empty if the name is + // uninomial. + Genus string `json:"genus,omitempty"` + + // Subgenus indicates the infrageneric epithet when present. + // This field is omitted if not applicable. + Subgenus string `json:"infragenericEpithet,omitempty"` + + // Species is the specific epithet of a binomial or trinomial. + Species string `json:"specificEpithet,omitempty"` + + // Infraspecies is the infraspecificEpither of trinomials (names with + // cardinality 3). We do not provide details for names with higher + // cardinality. + Infraspecies string `json:"infraspecificEpithet,omitempty"` + + // CultivarEpithet contains the cultivar name for cultivated plant varieties + // (e.g., "Golden Delicious" in "Malus domestica 'Golden Delicious'"). This + // field is populated only for names that include a cultivar designation. + CultivarEpithet string `json:"cultivarEpithet,omitempty"` + + // Notho denotes the hybrid status of a name, indicating whether it is a + // hybrid (e.g., "nothosubsp." or "nothovar." in "Salvia × sylvestris"). This + // field is empty if not given. + Notho string `json:"notho,omitempty"` + + // CombinationAuthorship provides the authorship for the current combination + // of the name, typically the authors who transferred the species to a new + // genus. (e.g., "K." in "Aus bus (L.) K."). This field is + // omitted if no combination authorship is specified. + CombinationAuthorship string `json:"combinationAuthorship,omitempty"` + + // CombinationExAuthorship captures the "ex" part of the combination + // authorship (e.g., "ex DC." in "Quercus robur L. ex DC."). This field is + // empty if no "ex" authorship exists. + CombinationExAuthorship string `json:"combinationExAuthorship,omitempty"` + + // CombinationAuthorshipYear records the year associated with the combination + // authorship, if provided (e.g., "1754" in "Homo sapiens (L.) K. 1753"). + // This field is omitted if the year is not specified. + CombinationAuthorshipYear string `json:"combinationAuthorshipYear,omitempty"` + + // BasionymAuthorship identifies the authorship of the original combination + // of the name (e.g., "Mill." in "Quercus robur (Mill.) L." where Mill. is + // the original author). This field is populated only if basionym authorship + // is present. + BasionymAuthorship string `json:"basionymAuthorship,omitempty"` + + // BasionymExAuthorship specifies the "ex" part of the basionym authorship, + // if applicable (e.g., "ex Torr." in "Pinus ponderosa Douglas ex Torr."). + // This field is empty when no "ex" basionym authorship is provided. + BasionymExAuthorship string `json:"basionymExAuthorship,omitempty"` + + // BasionymAuthorshipYear indicates the year tied to the basionym authorship + // (e.g., "1820" in "Pinus ponderosa Douglas, 1820"). This field is included + // only when the basionym year is explicitly stated. + BasionymAuthorshipYear string `json:"basionymAuthorshipYear,omitempty"` + + // VerbatimID is a UUID v5 generated from the verbatim value of the + // input name-string. Every unique string always generates the same + // UUID. + VerbatimID string `json:"id"` + + // ParserVersion is the version number of the GNparser. + ParserVersion string `json:"parserVersion"` +} + +// Flatten converts a Parsed struct into a ParsedFlat struct, which is a +// flattened representation of the parsed data. +func (p Parsed) Flatten() ParsedFlat { + var bact string + if p.Bacteria != nil && p.Bacteria.Valid { + switch p.Bacteria.Value { + case 0: + bact = "maybe" + case 1: + bact = "yes" + default: + bact = "no" + } + } + var hybrid string + if p.Hybrid != nil { + hybrid = p.Hybrid.String() + } + var graft string + if p.GraftChimera != nil { + graft = p.GraftChimera.String() + } + var surrogate string + if p.Surrogate != nil { + surrogate = p.Surrogate.String() + } + + res := ParsedFlat{ + Parsed: p.Parsed, + NomCode: p.NomCode, + ParseQuality: p.ParseQuality, + Verbatim: p.Verbatim, + Normalized: p.Normalized, + Cardinality: p.Cardinality, + Rank: p.Rank, + Bacteria: bact, + Candidatus: p.Candidatus, + Virus: p.Virus, + Cultivar: p.Cultivar, + DaggerChar: p.DaggerChar, + Hybrid: hybrid, + GraftChimera: graft, + Surrogate: surrogate, + Tail: p.Tail, + VerbatimID: p.VerbatimID, + ParserVersion: p.ParserVersion, + } + if !p.Parsed { + return res + } + + res.CanonicalSimple = p.Canonical.Simple + res.CanonicalFull = p.Canonical.Full + res.CanonicalStemmed = p.Canonical.Stemmed + + if p.Authorship != nil { + au := p.Authorship + res.Authorship = au.Verbatim + + if au.Original != nil { + res.BasionymAuthorship = authorship(au.Original) + res.BasionymExAuthorship = exAuthorship(au.Original) + res.BasionymAuthorshipYear = year(au.Original) + } + + if au.Combination != nil { + res.CombinationAuthorship = authorship(au.Combination) + res.CombinationExAuthorship = exAuthorship(au.Combination) + res.CombinationAuthorshipYear = year(au.Combination) + } + } + + switch detail := p.Details.(type) { + case DetailsUninomial: + res.Uninomial = detail.Uninomial.Value + case DetailsSpecies: + res.Genus = detail.Species.Genus + res.Subgenus = detail.Species.Subgenus + res.Species = detail.Species.Species + case DetailsInfraspecies: + if len(detail.Infraspecies.Infraspecies) == 1 { + res.Genus = detail.Infraspecies.Genus + res.Species = detail.Infraspecies.Species.Species + res.Rank = detail.Infraspecies.Infraspecies[0].Rank + res.Infraspecies = detail.Infraspecies.Infraspecies[0].Value + } + res.BasionymAuthorshipYear = year(detail.Infraspecies.Authorship.Original) + } + return res +} + +func authorship(ag *AuthGroup) string { + if ag == nil { + return "" + } + return joinAuthors(ag.Authors) +} + +func joinAuthors(aus []string) string { + var res string + switch len(aus) { + case 0: + res = "" + case 1: + res = aus[0] + case 2: + res = strings.Join(aus, " & ") + default: + res = strings.Join(aus[0:len(aus)-1], ", ") + res = res + " & " + aus[len(aus)-1] + } + return res +} + +func exAuthorship(ag *AuthGroup) string { + if ag == nil || ag.ExAuthors == nil { + return "" + } + return joinAuthors(ag.ExAuthors.Authors) +} + +func year(ag *AuthGroup) string { + if ag == nil || ag.Year == nil { + return "" + } + if ag.Year.IsApproximate { + return "(" + ag.Year.Value + ")" + } + return ag.Year.Value +} diff --git a/ent/parsed/flatten_test.go b/ent/parsed/flatten_test.go new file mode 100644 index 0000000..147ccbd --- /dev/null +++ b/ent/parsed/flatten_test.go @@ -0,0 +1,119 @@ +package parsed_test + +import ( + "testing" + + "github.com/gnames/gnparser/ent/parsed" + "github.com/stretchr/testify/assert" +) + +func TestFlatten(t *testing.T) { + tests := []struct { + name string + input parsed.Parsed + expected parsed.ParsedFlat + }{ + { + name: "Parsed with all fields", + input: parsed.Parsed{ + Parsed: true, + NomCode: "ICZN", + ParseQuality: 1, + Verbatim: "Aus bus", + Normalized: "Aus bus", + Cardinality: 2, + Rank: "species", + Bacteria: nil, + Candidatus: true, + Virus: false, + Cultivar: false, + DaggerChar: false, + Hybrid: nil, + GraftChimera: nil, + Surrogate: nil, + Tail: "tail", + VerbatimID: "12345", + ParserVersion: "1.0.0", + Canonical: &parsed.Canonical{ + Simple: "Aus bus", + Full: "Aus bus", + Stemmed: "Aus bus", + }, + Authorship: &parsed.Authorship{ + Verbatim: "L.", + Original: &parsed.AuthGroup{ + Authors: []string{"Linnaeus"}, + Year: &parsed.Year{Value: "1758"}, + }, + Combination: &parsed.AuthGroup{ + Authors: []string{"Smith"}, + Year: &parsed.Year{Value: "1800"}, + }, + }, + Details: parsed.DetailsSpecies{ + Species: parsed.Species{ + Genus: "Aus", + Species: "bus", + }, + }, + }, + expected: parsed.ParsedFlat{ + Parsed: true, + NomCode: "ICZN", + ParseQuality: 1, + Verbatim: "Aus bus", + Normalized: "Aus bus", + Cardinality: 2, + Rank: "species", + Bacteria: "", + Candidatus: true, + Virus: false, + Cultivar: false, + DaggerChar: false, + Hybrid: "", + GraftChimera: "", + Surrogate: "", + Tail: "tail", + VerbatimID: "12345", + ParserVersion: "1.0.0", + CanonicalSimple: "Aus bus", + CanonicalFull: "Aus bus", + CanonicalStemmed: "Aus bus", + Authorship: "L.", + BasionymAuthorship: "Linnaeus", + BasionymAuthorshipYear: "1758", + CombinationAuthorship: "Smith", + CombinationAuthorshipYear: "1800", + Genus: "Aus", + Subgenus: "", + Species: "bus", + }, + }, + { + name: "Parsed with minimal fields", + input: parsed.Parsed{ + Parsed: false, + NomCode: "ICZN", + ParseQuality: 0, + Verbatim: "Unknown", + VerbatimID: "67890", + ParserVersion: "1.0.0", + }, + expected: parsed.ParsedFlat{ + Parsed: false, + NomCode: "ICZN", + ParseQuality: 0, + Verbatim: "Unknown", + VerbatimID: "67890", + ParserVersion: "1.0.0", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.input.Flatten() + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/ent/parsed/interface.go b/ent/parsed/interface.go index 0494111..46f5d6c 100644 --- a/ent/parsed/interface.go +++ b/ent/parsed/interface.go @@ -3,6 +3,6 @@ package parsed // Details is a placeholder interface that allows to unify details of // various name types. type Details interface { - // isDetails is a placeholder method. + // isDetails is a placeholder method. isDetails() } diff --git a/go.mod b/go.mod index bce29f3..8c6131f 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/gnames/gnparser -go 1.23.5 +go 1.23.6 require ( github.com/dustin/go-humanize v1.0.1