From b526ade0452ece664c704b3c6c2230a74ed09e60 Mon Sep 17 00:00:00 2001 From: Dmitry Mozzherin Date: Wed, 5 Mar 2025 22:13:40 +0000 Subject: [PATCH 1/4] create flatenned output for details (close #283) --- ent/parsed/parsed.go | 183 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/ent/parsed/parsed.go b/ent/parsed/parsed.go index c4217a0..b92523e 100644 --- a/ent/parsed/parsed.go +++ b/ent/parsed/parsed.go @@ -144,6 +144,189 @@ type Parsed struct { ParserVersion string `json:"parserVersion"` } +// ParsedFlat is the result of a scientific name-string parsing flattened +// for the convenience. +type ParsedFlat struct { + // Parsed is false if parsing did not succeed. + Parsed bool `json:"parsed"` + + // NomCode modifies parsing rules according to provided nomenclatural code. + NomCode string `json:"nomenclaturalCode,omitempty"` + + // ParseQuality is a number that represents the quality of the + // parsing. + // + // 0 - name-string is not parseable + // 1 - no parsing problems encountered + // 2 - small parsing problems + // 3 - serious parsing problems + // 4 - severe problems, name could not be parsed completely + // + // The ParseQuality is equal to the quality of the most + // severe warning (see qualityWarnings). If no problems + // are encountered, and the parsing succeeded, the parseQuality + // is set to 1. If parsing failed, the parseQuality is 0. + ParseQuality int `json:"quality"` + + // Verbatim is input name-string without modifications. + Verbatim string `json:"verbatim"` + + // Normalized is a normalized version of the input name-string. + Normalized string `json:"normalized,omitempty"` + + // Canonical are simplified versions of a name-string more suitable for + // matching and comparing name-strings than the verbatim version. + CanonicalSimple string `json:"canonical,omitempty"` + + // Cardinality allows to sort, partition names according to number of + // elements in their canonical forms. 
+ // + // 0 - cardinality cannot be calculated + // 1 - uninomial + // 2 - binomial + // 3 - trinomial + // 4 - quadrinomial + Cardinality int `json:"cardinality"` + + // Rank provides information about the rank of the name. It is not + // always possible to infer rank correctly, so this field will be + // omitted when the data for it does not exist. + Rank string `json:"rank,omitempty"` + + // Authorship is the verbatim authorship of the name. + Authorship string `json:"authorship,omitempty"` + + // Bacteria is not nil if the input name has a genus + // that is registered as bacterial. Possible + // values are "maybe" - if the genus has homonyms in other groups + // and "yes" if GNparser dictionary does not detect any homonyms + // + // The bacterial names often contain strain information which are + // not parseable and are placed into the "tail" field. + Bacteria *tb.Tribool `json:"bacteria,omitempty"` + + // Candidatus indicates that the parsed string is a candidatus bacterial name. + Candidatus bool `json:"candidatus,omitempty"` + + // Virus is set to true in case if name is not parsed, and probably + // belongs to a wide variety of sub-cellular entities like + // + // - viruses + // - plasmids + // - prions + // - RNA + // - DNA + // + // Viruses are the vast majority in this group of names, + // as a result they gave (very imprecise) name to + // the field. + // + // We do plan to create a parser for viruses at some point, + // which will expand this group into more precise categories. + Virus bool `json:"virus,omitempty"` + + // Cultivar is true if a name was parsed as a cultivar. + Cultivar bool `json:"cultivar,omitempty"` + + // DaggerChar if true if a name-string includes '†' rune. + // This rune might mean a fossil, or be indication of the clade extinction. + DaggerChar bool `json:"daggerChar,omitempty"` + + // Hybrid is a string representation of a hybrid type. 
+ // + // - a non-categorized hybrid + // - named hybrid + // - notho- hybrid + // - hybrid formula + Hybrid string `json:"hybrid,omitempty"` + + // GraftChimera is a string representation of graft chimera. + // + // - a non-categorized graft chimera + // - named graft chimera + // - graft chimera formula + GraftChimera string `json:"graftchimera,omitempty"` + + // Surrogate is a string repsresentation of a surrogate type. + + // - a non-categorized surrogates + // - surrogate names from BOLD project + // - comparisons (Homo cf. sapiens) + // - approximations (names for specimen that not fully identified) + Surrogate string `json:"surrogate,omitempty"` + + // Tail is an unparseable tail of a name. It might contain "junk", + // annotations, malformed parts of a scientific name, taxonomic concept + // indications, bacterial strains etc. If there is an unparseable tail, the + // quality of the name-parsing is set to the worst category. + Tail string `json:"tail,omitempty"` + + // Uninomial represents the single name used for uninomial nomenclature, + // typically applied to higher taxonomic ranks (e.g., family or order names + // like "Asteraceae"). This field is populated only for uninomial names and + // omitted otherwise. + Uninomial string `json:"uninomial,omitempty"` + + // Genus specifies the genus part of a binomial or trinomial scientific name + // (e.g., "Quercus" in "Quercus robur"). This field is empty if the name is + // uninomial. + Genus string `json:"genus,omitempty"` + + // InfragenericEpithet indicates the infrageneric epithet when present. + // This field is omitted if not applicable. + InfragenericEpithet string `json:"infragenericEpithet,omitempty"` + + // CultivarEpithet contains the cultivar name for cultivated plant varieties + // (e.g., "Golden Delicious" in "Malus domestica 'Golden Delicious'"). This + // field is populated only for names that include a cultivar designation. 
+ CultivarEpithet string `json:"cultivarEpithet,omitempty"` + + // Notho denotes the hybrid status of a name, indicating whether it is a + // hybrid (e.g., "nothosubsp." or "nothovar." in "Salvia × sylvestris"). This + // field is empty if not given. + Notho string `json:"notho,omitempty"` + + // CombinationAuthorship provides the authorship for the current combination + // of the name, typically the authors who transferred the species to a new + // genus. (e.g., "K." in "Aus bus (L.) K."). This field is + // omitted if no combination authorship is specified. + CombinationAuthorship string `json:"combinationAuthorship,omitempty"` + + // CombinationExAuthorship captures the "ex" part of the combination + // authorship (e.g., "ex DC." in "Quercus robur L. ex DC."). This field is + // empty if no "ex" authorship exists. + CombinationExAuthorship string `json:"combinationExAuthorship,omitempty"` + + // CombinationAuthorshipYear records the year associated with the combination + // authorship, if provided (e.g., "1753" in "Homo sapiens (L.) K. 1753"). + // This field is omitted if the year is not specified. + CombinationAuthorshipYear string `json:"combinationAuthorshipYear,omitempty"` + + // BasionymAuthorship identifies the authorship of the original combination + // of the name (e.g., "Mill." in "Quercus robur (Mill.) L." where Mill. is + // the original author). This field is populated only if basionym authorship + // is present. + BasionymAuthorship string `json:"basionymAuthorship,omitempty"` + + // BasionymExAuthorship specifies the "ex" part of the basionym authorship, + // if applicable (e.g., "ex Torr." in "Pinus ponderosa Douglas ex Torr."). + // This field is empty when no "ex" basionym authorship is provided. + BasionymExAuthorship string `json:"basionymExAuthorship,omitempty"` + + // BasionymAuthorshipYear indicates the year tied to the basionym authorship + // (e.g., "1820" in "Pinus ponderosa Douglas, 1820"). 
This field is included + // only when the basionym year is explicitly stated. + BasionymAuthorshipYear string `json:"basionymAuthorshipYear,omitempty"` + + // VerbatimID is a UUID v5 generated from the verbatim value of the + // input name-string. Every unique string always generates the same + // UUID. + VerbatimID string `json:"id"` + + // ParserVersion is the version number of the GNparser. + ParserVersion string `json:"parserVersion"` +} + // Canonical are simplified forms of a name-string more suitable for // matching and comparing name-strings than the verbatim version. type Canonical struct { From 70b031e7b34dd750e05b1c0e052d7b5d39e80aa8 Mon Sep 17 00:00:00 2001 From: Dmitry Mozzherin Date: Thu, 6 Mar 2025 00:44:08 +0000 Subject: [PATCH 2/4] wip --- ent/parsed/parsed.go | 90 ++++++++++++++++++++++++++++++++++++++++---- go.mod | 2 +- 2 files changed, 84 insertions(+), 8 deletions(-) diff --git a/ent/parsed/parsed.go b/ent/parsed/parsed.go index b92523e..fde8f44 100644 --- a/ent/parsed/parsed.go +++ b/ent/parsed/parsed.go @@ -3,6 +3,7 @@ package parsed import ( + "github.com/gnames/gnparser/ent/parsed" tb "github.com/gnames/tribool" ) @@ -174,9 +175,29 @@ type ParsedFlat struct { // Normalized is a normalized version of the input name-string. Normalized string `json:"normalized,omitempty"` - // Canonical are simplified versions of a name-string more suitable for - // matching and comparing name-strings than the verbatim version. - CanonicalSimple string `json:"canonical,omitempty"` + // CanonicalSimple is a simplified version of a name where some elements like ranks, + // or hybrid signs "×" are omitted (hybrid signs are present for hybrid + // formulas). + // + // It is most useful to match names in general. + CanonicalSimple string `json:"canonicalSimple,omitempty"` + + // CanonicalFull is a canonical form that keeps hybrid signs "×" for named + // hybrids and shows infra-specific ranks. 
+ // + // It is most useful for detection of the best matches from + // multiple results. It is also recommended for displaying + // canonical forms of botanical names. + CanonicalFull string `json:"canonicalFull,omitempty"` + + // CanonicalStemmed is the most "normalized" and simplified version of the name. + // Species epithets are stripped of suffixes, "j" character converted to "i", + // "v" character converted to "u" according to "Schinke R, Greengrass M, + // Robertson AM and Willett P (1996)" + // + // It is most useful to match names when a variability in suffixes is + // possible. + CanonicalStemmed string `json:"canonicalStemmed,omitempty"` // Cardinality allows to sort, partition names according to number of // elements in their canonical forms. @@ -203,7 +224,7 @@ type ParsedFlat struct { // // The bacterial names often contain strain information which are // not parseable and are placed into the "tail" field. - Bacteria *tb.Tribool `json:"bacteria,omitempty"` + Bacteria string `json:"bacteria,omitempty"` // Candidatus indicates that the parsed string is a candidatus bacterial name. Candidatus bool `json:"candidatus,omitempty"` @@ -272,9 +293,17 @@ type ParsedFlat struct { // uninomial. Genus string `json:"genus,omitempty"` - // InfragenericEpithet indicates the infrageneric epithet when present. + // Subgenus indicates the infrageneric epithet when present. // This field is omitted if not applicable. - InfragenericEpithet string `json:"infragenericEpithet,omitempty"` + Subgenus string `json:"infragenericEpithet,omitempty"` + + // Species is the specific epithet of a binomial or trinomial. + Species string `json:"specificEpithet,omitempty"` + + // Infraspecies is the infraspecificEpither of trinomials (names with + // cardinality 3). We do not provide details for names with higher + // cardinality. 
+ Infraspecies string `json:"infraspecificEpithet,omitempty"` // CultivarEpithet contains the cultivar name for cultivated plant varieties // (e.g., "Golden Delicious" in "Malus domestica 'Golden Delicious'"). This @@ -298,7 +327,7 @@ type ParsedFlat struct { CombinationExAuthorship string `json:"combinationExAuthorship,omitempty"` // CombinationAuthorshipYear records the year associated with the combination - // authorship, if provided (e.g., "1753" in "Homo sapiens (L.) K. 1753"). + // authorship, if provided (e.g., "1754" in "Homo sapiens (L.) K. 1753"). // This field is omitted if the year is not specified. CombinationAuthorshipYear string `json:"combinationAuthorshipYear,omitempty"` @@ -424,3 +453,50 @@ type Year struct { // a question mark "188?", by parentheses "(1888)". IsApproximate bool `json:"isApproximate,omitempty"` } + +func (p Parsed) Flatten() ParsedFlat { + res := ParsedFlat{ + Parsed: p.Parsed, + NomCode: p.NomCode, + ParseQuality: p.ParseQuality, + Verbatim: p.Verbatim, + Normalized: p.Normalized, + Cardinality: p.Cardinality, + Rank: p.Rank, + Bacteria: p.Bacteria.String(), + Candidatus: p.Candidatus, + Virus: p.Virus, + Cultivar: p.Cultivar, + DaggerChar: p.DaggerChar, + Hybrid: p.Hybrid.String(), + GraftChimera: p.GraftChimera.String(), + Surrogate: p.Surrogate.String(), + Tail: p.Tail, + VerbatimID: p.VerbatimID, + ParserVersion: p.ParserVersion, + } + if !p.Parsed { + return res + } + + res.CanonicalSimple = p.Canonical.Simple + res.CanonicalFull = p.Canonical.Full + res.CanonicalStemmed = p.Canonical.Stemmed + + switch detail := p.Details.(type) { + case parsed.DetailsUninomial: + res.Uninomial = detail.Uninomial.Value + case parsed.DetailsSpecies: + res.Genus = detail.Species.Genus + res.Subgenus = detail.Species.Subgenus + res.Species = detail.Species.Species + case parsed.DetailsInfraspecies: + if len(detail.Infraspecies.Infraspecies) == 1 { + res.Genus = detail.Infraspecies.Genus + res.Species = detail.Infraspecies.Species.Species + 
res.Rank = detail.Infraspecies.Infraspecies[0].Rank + res.Infraspecies = detail.Infraspecies.Infraspecies[0].Value + } + } + return res +} diff --git a/go.mod b/go.mod index bce29f3..8c6131f 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/gnames/gnparser -go 1.23.5 +go 1.23.6 require ( github.com/dustin/go-humanize v1.0.1 From 0424393abc5591c6bfd85a6b229dda1d18895413 Mon Sep 17 00:00:00 2001 From: Dmitry Mozzherin Date: Thu, 6 Mar 2025 10:36:32 +0000 Subject: [PATCH 3/4] wip --- ent/parsed/parsed.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ent/parsed/parsed.go b/ent/parsed/parsed.go index fde8f44..d14fa5f 100644 --- a/ent/parsed/parsed.go +++ b/ent/parsed/parsed.go @@ -3,7 +3,6 @@ package parsed import ( - "github.com/gnames/gnparser/ent/parsed" tb "github.com/gnames/tribool" ) @@ -483,14 +482,20 @@ func (p Parsed) Flatten() ParsedFlat { res.CanonicalFull = p.Canonical.Full res.CanonicalStemmed = p.Canonical.Stemmed + if p.Authorship != nil { + au := p.Authorship + res.Authorship = au.Verbatim + + } + switch detail := p.Details.(type) { - case parsed.DetailsUninomial: + case DetailsUninomial: res.Uninomial = detail.Uninomial.Value - case parsed.DetailsSpecies: + case DetailsSpecies: res.Genus = detail.Species.Genus res.Subgenus = detail.Species.Subgenus res.Species = detail.Species.Species - case parsed.DetailsInfraspecies: + case DetailsInfraspecies: if len(detail.Infraspecies.Infraspecies) == 1 { res.Genus = detail.Infraspecies.Genus res.Species = detail.Infraspecies.Species.Species From 5187c10603eec286caeae5a2ca5cc94995ce1454 Mon Sep 17 00:00:00 2001 From: Dmitry Mozzherin Date: Thu, 6 Mar 2025 12:47:48 +0000 Subject: [PATCH 4/4] wip --- ent/parsed/flatten.go | 345 +++++++++++++++++++++++++++++++++++++ ent/parsed/flatten_test.go | 119 +++++++++++++ ent/parsed/interface.go | 2 +- ent/parsed/parsed.go | 264 ---------------------------- 4 files changed, 465 insertions(+), 265 deletions(-) create mode 100644 
ent/parsed/flatten.go create mode 100644 ent/parsed/flatten_test.go diff --git a/ent/parsed/flatten.go b/ent/parsed/flatten.go new file mode 100644 index 0000000..0d45def --- /dev/null +++ b/ent/parsed/flatten.go @@ -0,0 +1,345 @@ +package parsed + +import "strings" + +// ParsedFlat is the result of a scientific name-string parsing flattened +// for the convenience. +type ParsedFlat struct { + // Parsed is false if parsing did not succeed. + Parsed bool `json:"parsed"` + + // NomCode modifies parsing rules according to provided nomenclatural code. + NomCode string `json:"nomenclaturalCode,omitempty"` + + // ParseQuality is a number that represents the quality of the + // parsing. + // + // 0 - name-string is not parseable + // 1 - no parsing problems encountered + // 2 - small parsing problems + // 3 - serious parsing problems + // 4 - severe problems, name could not be parsed completely + // + // The ParseQuality is equal to the quality of the most + // severe warning (see qualityWarnings). If no problems + // are encountered, and the parsing succeeded, the parseQuality + // is set to 1. If parsing failed, the parseQuality is 0. + ParseQuality int `json:"quality"` + + // Verbatim is input name-string without modifications. + Verbatim string `json:"verbatim"` + + // Normalized is a normalized version of the input name-string. + Normalized string `json:"normalized,omitempty"` + + // CanonicalSimple is a simplified version of a name where some elements like ranks, + // or hybrid signs "×" are omitted (hybrid signs are present for hybrid + // formulas). + // + // It is most useful to match names in general. + CanonicalSimple string `json:"canonicalSimple,omitempty"` + + // CanonicalFull is a canonical form that keeps hybrid signs "×" for named + // hybrids and shows infra-specific ranks. + // + // It is most useful for detection of the best matches from + // multiple results. It is also recommended for displaying + // canonical forms of botanical names. 
+ CanonicalFull string `json:"canonicalFull,omitempty"` + + // CanonicalStemmed is the most "normalized" and simplified version of the name. + // Species epithets are stripped of suffixes, "j" character converted to "i", + // "v" character converted to "u" according to "Schinke R, Greengrass M, + // Robertson AM and Willett P (1996)" + // + // It is most useful to match names when a variability in suffixes is + // possible. + CanonicalStemmed string `json:"canonicalStemmed,omitempty"` + + // Cardinality allows to sort, partition names according to number of + // elements in their canonical forms. + // + // 0 - cardinality cannot be calculated + // 1 - uninomial + // 2 - binomial + // 3 - trinomial + // 4 - quadrinomial + Cardinality int `json:"cardinality"` + + // Rank provides information about the rank of the name. It is not + // always possible to infer rank correctly, so this field will be + // omitted when the data for it does not exist. + Rank string `json:"rank,omitempty"` + + // Authorship is the verbatim authorship of the name. + Authorship string `json:"authorship,omitempty"` + + // Bacteria is not empty if the input name has a genus + // that is registered as bacterial. Possible + // values are "maybe" - if the genus has homonyms in other groups + // and "yes" if GNparser dictionary does not detect any homonyms + // + // The bacterial names often contain strain information which are + // not parseable and are placed into the "tail" field. + Bacteria string `json:"bacteria,omitempty"` + + // Candidatus indicates that the parsed string is a candidatus bacterial name. + Candidatus bool `json:"candidatus,omitempty"` + + // Virus is set to true if the name is not parsed, and probably + // belongs to a wide variety of sub-cellular entities like + // + // - viruses + // - plasmids + // - prions + // - RNA + // - DNA + // + // Viruses are the vast majority in this group of names, + // as a result they gave (very imprecise) name to + // the field. 
+ // + // We do plan to create a parser for viruses at some point, + // which will expand this group into more precise categories. + Virus bool `json:"virus,omitempty"` + + // Cultivar is true if a name was parsed as a cultivar. + Cultivar bool `json:"cultivar,omitempty"` + + // DaggerChar is true if a name-string includes the '†' rune. + // This rune might mean a fossil, or be indication of the clade extinction. + DaggerChar bool `json:"daggerChar,omitempty"` + + // Hybrid is a string representation of a hybrid type. + // + // - a non-categorized hybrid + // - named hybrid + // - notho- hybrid + // - hybrid formula + Hybrid string `json:"hybrid,omitempty"` + + // GraftChimera is a string representation of graft chimera. + // + // - a non-categorized graft chimera + // - named graft chimera + // - graft chimera formula + GraftChimera string `json:"graftchimera,omitempty"` + + // Surrogate is a string representation of a surrogate type. + // + // - a non-categorized surrogates + // - surrogate names from BOLD project + // - comparisons (Homo cf. sapiens) + // - approximations (names for specimens that are not fully identified) + Surrogate string `json:"surrogate,omitempty"` + + // Tail is an unparseable tail of a name. It might contain "junk", + // annotations, malformed parts of a scientific name, taxonomic concept + // indications, bacterial strains etc. If there is an unparseable tail, the + // quality of the name-parsing is set to the worst category. + Tail string `json:"tail,omitempty"` + + // Uninomial represents the single name used for uninomial nomenclature, + // typically applied to higher taxonomic ranks (e.g., family or order names + // like "Asteraceae"). This field is populated only for uninomial names and + // omitted otherwise. + Uninomial string `json:"uninomial,omitempty"` + + // Genus specifies the genus part of a binomial or trinomial scientific name + // (e.g., "Quercus" in "Quercus robur"). This field is empty if the name is + // uninomial. 
+ Genus string `json:"genus,omitempty"` + + // Subgenus indicates the infrageneric epithet when present. + // This field is omitted if not applicable. + Subgenus string `json:"infragenericEpithet,omitempty"` + + // Species is the specific epithet of a binomial or trinomial. + Species string `json:"specificEpithet,omitempty"` + + // Infraspecies is the infraspecific epithet of trinomials (names with + // cardinality 3). We do not provide details for names with higher + // cardinality. + Infraspecies string `json:"infraspecificEpithet,omitempty"` + + // CultivarEpithet contains the cultivar name for cultivated plant varieties + // (e.g., "Golden Delicious" in "Malus domestica 'Golden Delicious'"). This + // field is populated only for names that include a cultivar designation. + CultivarEpithet string `json:"cultivarEpithet,omitempty"` + + // Notho denotes the hybrid status of a name, indicating whether it is a + // hybrid (e.g., "nothosubsp." or "nothovar." in "Salvia × sylvestris"). This + // field is empty if not given. + Notho string `json:"notho,omitempty"` + + // CombinationAuthorship provides the authorship for the current combination + // of the name, typically the authors who transferred the species to a new + // genus (e.g., "K." in "Aus bus (L.) K."). This field is + // omitted if no combination authorship is specified. + CombinationAuthorship string `json:"combinationAuthorship,omitempty"` + + // CombinationExAuthorship captures the "ex" part of the combination + // authorship (e.g., "ex DC." in "Quercus robur L. ex DC."). This field is + // empty if no "ex" authorship exists. + CombinationExAuthorship string `json:"combinationExAuthorship,omitempty"` + + // CombinationAuthorshipYear records the year associated with the combination + // authorship, if provided (e.g., "1753" in "Homo sapiens (L.) K. 1753"). + // This field is omitted if the year is not specified. 
+ CombinationAuthorshipYear string `json:"combinationAuthorshipYear,omitempty"` + + // BasionymAuthorship identifies the authorship of the original combination + // of the name (e.g., "Mill." in "Quercus robur (Mill.) L." where Mill. is + // the original author). This field is populated only if basionym authorship + // is present. + BasionymAuthorship string `json:"basionymAuthorship,omitempty"` + + // BasionymExAuthorship specifies the "ex" part of the basionym authorship, + // if applicable (e.g., "ex Torr." in "Pinus ponderosa Douglas ex Torr."). + // This field is empty when no "ex" basionym authorship is provided. + BasionymExAuthorship string `json:"basionymExAuthorship,omitempty"` + + // BasionymAuthorshipYear indicates the year tied to the basionym authorship + // (e.g., "1820" in "Pinus ponderosa Douglas, 1820"). This field is included + // only when the basionym year is explicitly stated. + BasionymAuthorshipYear string `json:"basionymAuthorshipYear,omitempty"` + + // VerbatimID is a UUID v5 generated from the verbatim value of the + // input name-string. Every unique string always generates the same + // UUID. + VerbatimID string `json:"id"` + + // ParserVersion is the version number of the GNparser. + ParserVersion string `json:"parserVersion"` +} + +// Flatten converts a Parsed struct into a ParsedFlat struct, which is a +// flattened representation of the parsed data. 
+func (p Parsed) Flatten() ParsedFlat { + var bact string + if p.Bacteria != nil && p.Bacteria.Valid { + switch p.Bacteria.Value { + case 0: + bact = "maybe" + case 1: + bact = "yes" + default: + bact = "no" + } + } + var hybrid string + if p.Hybrid != nil { + hybrid = p.Hybrid.String() + } + var graft string + if p.GraftChimera != nil { + graft = p.GraftChimera.String() + } + var surrogate string + if p.Surrogate != nil { + surrogate = p.Surrogate.String() + } + + res := ParsedFlat{ + Parsed: p.Parsed, + NomCode: p.NomCode, + ParseQuality: p.ParseQuality, + Verbatim: p.Verbatim, + Normalized: p.Normalized, + Cardinality: p.Cardinality, + Rank: p.Rank, + Bacteria: bact, + Candidatus: p.Candidatus, + Virus: p.Virus, + Cultivar: p.Cultivar, + DaggerChar: p.DaggerChar, + Hybrid: hybrid, + GraftChimera: graft, + Surrogate: surrogate, + Tail: p.Tail, + VerbatimID: p.VerbatimID, + ParserVersion: p.ParserVersion, + } + if !p.Parsed { + return res + } + + res.CanonicalSimple = p.Canonical.Simple + res.CanonicalFull = p.Canonical.Full + res.CanonicalStemmed = p.Canonical.Stemmed + + if p.Authorship != nil { + au := p.Authorship + res.Authorship = au.Verbatim + + if au.Original != nil { + res.BasionymAuthorship = authorship(au.Original) + res.BasionymExAuthorship = exAuthorship(au.Original) + res.BasionymAuthorshipYear = year(au.Original) + } + + if au.Combination != nil { + res.CombinationAuthorship = authorship(au.Combination) + res.CombinationExAuthorship = exAuthorship(au.Combination) + res.CombinationAuthorshipYear = year(au.Combination) + } + } + + switch detail := p.Details.(type) { + case DetailsUninomial: + res.Uninomial = detail.Uninomial.Value + case DetailsSpecies: + res.Genus = detail.Species.Genus + res.Subgenus = detail.Species.Subgenus + res.Species = detail.Species.Species + case DetailsInfraspecies: + if len(detail.Infraspecies.Infraspecies) == 1 { + res.Genus = detail.Infraspecies.Genus + res.Species = detail.Infraspecies.Species.Species + res.Rank = 
detail.Infraspecies.Infraspecies[0].Rank + res.Infraspecies = detail.Infraspecies.Infraspecies[0].Value + } + res.BasionymAuthorshipYear = year(detail.Infraspecies.Authorship.Original) + } + return res +} + +func authorship(ag *AuthGroup) string { + if ag == nil { + return "" + } + return joinAuthors(ag.Authors) +} + +func joinAuthors(aus []string) string { + var res string + switch len(aus) { + case 0: + res = "" + case 1: + res = aus[0] + case 2: + res = strings.Join(aus, " & ") + default: + res = strings.Join(aus[0:len(aus)-1], ", ") + res = res + " & " + aus[len(aus)-1] + } + return res +} + +func exAuthorship(ag *AuthGroup) string { + if ag == nil || ag.ExAuthors == nil { + return "" + } + return joinAuthors(ag.ExAuthors.Authors) +} + +func year(ag *AuthGroup) string { + if ag == nil || ag.Year == nil { + return "" + } + if ag.Year.IsApproximate { + return "(" + ag.Year.Value + ")" + } + return ag.Year.Value +} diff --git a/ent/parsed/flatten_test.go b/ent/parsed/flatten_test.go new file mode 100644 index 0000000..147ccbd --- /dev/null +++ b/ent/parsed/flatten_test.go @@ -0,0 +1,119 @@ +package parsed_test + +import ( + "testing" + + "github.com/gnames/gnparser/ent/parsed" + "github.com/stretchr/testify/assert" +) + +func TestFlatten(t *testing.T) { + tests := []struct { + name string + input parsed.Parsed + expected parsed.ParsedFlat + }{ + { + name: "Parsed with all fields", + input: parsed.Parsed{ + Parsed: true, + NomCode: "ICZN", + ParseQuality: 1, + Verbatim: "Aus bus", + Normalized: "Aus bus", + Cardinality: 2, + Rank: "species", + Bacteria: nil, + Candidatus: true, + Virus: false, + Cultivar: false, + DaggerChar: false, + Hybrid: nil, + GraftChimera: nil, + Surrogate: nil, + Tail: "tail", + VerbatimID: "12345", + ParserVersion: "1.0.0", + Canonical: &parsed.Canonical{ + Simple: "Aus bus", + Full: "Aus bus", + Stemmed: "Aus bus", + }, + Authorship: &parsed.Authorship{ + Verbatim: "L.", + Original: &parsed.AuthGroup{ + Authors: []string{"Linnaeus"}, + 
Year: &parsed.Year{Value: "1758"}, + }, + Combination: &parsed.AuthGroup{ + Authors: []string{"Smith"}, + Year: &parsed.Year{Value: "1800"}, + }, + }, + Details: parsed.DetailsSpecies{ + Species: parsed.Species{ + Genus: "Aus", + Species: "bus", + }, + }, + }, + expected: parsed.ParsedFlat{ + Parsed: true, + NomCode: "ICZN", + ParseQuality: 1, + Verbatim: "Aus bus", + Normalized: "Aus bus", + Cardinality: 2, + Rank: "species", + Bacteria: "", + Candidatus: true, + Virus: false, + Cultivar: false, + DaggerChar: false, + Hybrid: "", + GraftChimera: "", + Surrogate: "", + Tail: "tail", + VerbatimID: "12345", + ParserVersion: "1.0.0", + CanonicalSimple: "Aus bus", + CanonicalFull: "Aus bus", + CanonicalStemmed: "Aus bus", + Authorship: "L.", + BasionymAuthorship: "Linnaeus", + BasionymAuthorshipYear: "1758", + CombinationAuthorship: "Smith", + CombinationAuthorshipYear: "1800", + Genus: "Aus", + Subgenus: "", + Species: "bus", + }, + }, + { + name: "Parsed with minimal fields", + input: parsed.Parsed{ + Parsed: false, + NomCode: "ICZN", + ParseQuality: 0, + Verbatim: "Unknown", + VerbatimID: "67890", + ParserVersion: "1.0.0", + }, + expected: parsed.ParsedFlat{ + Parsed: false, + NomCode: "ICZN", + ParseQuality: 0, + Verbatim: "Unknown", + VerbatimID: "67890", + ParserVersion: "1.0.0", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.input.Flatten() + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/ent/parsed/interface.go b/ent/parsed/interface.go index 0494111..46f5d6c 100644 --- a/ent/parsed/interface.go +++ b/ent/parsed/interface.go @@ -3,6 +3,6 @@ package parsed // Details is a placeholder interface that allows to unify details of // various name types. type Details interface { - // isDetails is a placeholder method. + // isDetails is a placeholder method. 
isDetails() } diff --git a/ent/parsed/parsed.go b/ent/parsed/parsed.go index d14fa5f..c4217a0 100644 --- a/ent/parsed/parsed.go +++ b/ent/parsed/parsed.go @@ -144,217 +144,6 @@ type Parsed struct { ParserVersion string `json:"parserVersion"` } -// ParsedFlat is the result of a scientific name-string parsing flattened -// for the convenience. -type ParsedFlat struct { - // Parsed is false if parsing did not succeed. - Parsed bool `json:"parsed"` - - // NomCode modifies parsing rules according to provided nomenclatural code. - NomCode string `json:"nomenclaturalCode,omitempty"` - - // ParseQuality is a number that represents the quality of the - // parsing. - // - // 0 - name-string is not parseable - // 1 - no parsing problems encountered - // 2 - small parsing problems - // 3 - serious parsing problems - // 4 - severe problems, name could not be parsed completely - // - // The ParseQuality is equal to the quality of the most - // severe warning (see qualityWarnings). If no problems - // are encountered, and the parsing succeeded, the parseQuality - // is set to 1. If parsing failed, the parseQuality is 0. - ParseQuality int `json:"quality"` - - // Verbatim is input name-string without modifications. - Verbatim string `json:"verbatim"` - - // Normalized is a normalized version of the input name-string. - Normalized string `json:"normalized,omitempty"` - - // CanonicalSimple is a simplified version of a name where some elements like ranks, - // or hybrid signs "×" are omitted (hybrid signs are present for hybrid - // formulas). - // - // It is most useful to match names in general. - CanonicalSimple string `json:"canonicalSimple,omitempty"` - - // CanonicalFull is a canonical form that keeps hybrid signs "×" for named - // hybrids and shows infra-specific ranks. - // - // It is most useful for detection of the best matches from - // multiple results. It is also recommended for displaying - // canonical forms of botanical names. 
- CanonicalFull string `json:"canonicalFull,omitempty"` - - // CanonicalStemmed is the most "normalized" and simplified version of the name. - // Species epithets are stripped of suffixes, "j" character converted to "i", - // "v" character converted to "u" according to "Schinke R, Greengrass M, - // Robertson AM and Willett P (1996)" - // - // It is most useful to match names when a variability in suffixes is - // possible. - CanonicalStemmed string `json:"canonicalStemmed,omitempty"` - - // Cardinality allows to sort, partition names according to number of - // elements in their canonical forms. - // - // 0 - cardinality cannot be calculated - // 1 - uninomial - // 2 - binomial - // 3 - trinomial - // 4 - quadrinomial - Cardinality int `json:"cardinality"` - - // Rank provides information about the rank of the name. It is not - // always possible to infer rank correctly, so this field will be - // omitted when the data for it does not exist. - Rank string `json:"rank,omitempty"` - - // Authorship is the verbatim authorship of the name. - Authorship string `json:"authorship,omitempty"` - - // Bacteria is not nil if the input name has a genus - // that is registered as bacterial. Possible - // values are "maybe" - if the genus has homonyms in other groups - // and "yes" if GNparser dictionary does not detect any homonyms - // - // The bacterial names often contain strain information which are - // not parseable and are placed into the "tail" field. - Bacteria string `json:"bacteria,omitempty"` - - // Candidatus indicates that the parsed string is a candidatus bacterial name. - Candidatus bool `json:"candidatus,omitempty"` - - // Virus is set to true in case if name is not parsed, and probably - // belongs to a wide variety of sub-cellular entities like - // - // - viruses - // - plasmids - // - prions - // - RNA - // - DNA - // - // Viruses are the vast majority in this group of names, - // as a result they gave (very imprecise) name to - // the field. 
- // - // We do plan to create a parser for viruses at some point, - // which will expand this group into more precise categories. - Virus bool `json:"virus,omitempty"` - - // Cultivar is true if a name was parsed as a cultivar. - Cultivar bool `json:"cultivar,omitempty"` - - // DaggerChar if true if a name-string includes '†' rune. - // This rune might mean a fossil, or be indication of the clade extinction. - DaggerChar bool `json:"daggerChar,omitempty"` - - // Hybrid is a string representation of a hybrid type. - // - // - a non-categorized hybrid - // - named hybrid - // - notho- hybrid - // - hybrid formula - Hybrid string `json:"hybrid,omitempty"` - - // GraftChimera is a string representation of graft chimera. - // - // - a non-categorized graft chimera - // - named graft chimera - // - graft chimera formula - GraftChimera string `json:"graftchimera,omitempty"` - - // Surrogate is a string repsresentation of a surrogate type. - - // - a non-categorized surrogates - // - surrogate names from BOLD project - // - comparisons (Homo cf. sapiens) - // - approximations (names for specimen that not fully identified) - Surrogate string `json:"surrogate,omitempty"` - - // Tail is an unparseable tail of a name. It might contain "junk", - // annotations, malformed parts of a scientific name, taxonomic concept - // indications, bacterial strains etc. If there is an unparseable tail, the - // quality of the name-parsing is set to the worst category. - Tail string `json:"tail,omitempty"` - - // Uninomial represents the single name used for uninomial nomenclature, - // typically applied to higher taxonomic ranks (e.g., family or order names - // like "Asteraceae"). This field is populated only for uninomial names and - // omitted otherwise. - Uninomial string `json:"uninomial,omitempty"` - - // Genus specifies the genus part of a binomial or trinomial scientific name - // (e.g., "Quercus" in "Quercus robur"). This field is empty if the name is - // uninomial. 
- Genus string `json:"genus,omitempty"` - - // Subgenus indicates the infrageneric epithet when present. - // This field is omitted if not applicable. - Subgenus string `json:"infragenericEpithet,omitempty"` - - // Species is the specific epithet of a binomial or trinomial. - Species string `json:"specificEpithet,omitempty"` - - // Infraspecies is the infraspecificEpither of trinomials (names with - // cardinality 3). We do not provide details for names with higher - // cardinality. - Infraspecies string `json:"infraspecificEpithet,omitempty"` - - // CultivarEpithet contains the cultivar name for cultivated plant varieties - // (e.g., "Golden Delicious" in "Malus domestica 'Golden Delicious'"). This - // field is populated only for names that include a cultivar designation. - CultivarEpithet string `json:"cultivarEpithet,omitempty"` - - // Notho denotes the hybrid status of a name, indicating whether it is a - // hybrid (e.g., "nothosubsp." or "nothovar." in "Salvia × sylvestris"). This - // field is empty if not given. - Notho string `json:"notho,omitempty"` - - // CombinationAuthorship provides the authorship for the current combination - // of the name, typically the authors who transferred the species to a new - // genus. (e.g., "K." in "Aus bus (L.) K."). This field is - // omitted if no combination authorship is specified. - CombinationAuthorship string `json:"combinationAuthorship,omitempty"` - - // CombinationExAuthorship captures the "ex" part of the combination - // authorship (e.g., "ex DC." in "Quercus robur L. ex DC."). This field is - // empty if no "ex" authorship exists. - CombinationExAuthorship string `json:"combinationExAuthorship,omitempty"` - - // CombinationAuthorshipYear records the year associated with the combination - // authorship, if provided (e.g., "1754" in "Homo sapiens (L.) K. 1753"). - // This field is omitted if the year is not specified. 
- CombinationAuthorshipYear string `json:"combinationAuthorshipYear,omitempty"` - - // BasionymAuthorship identifies the authorship of the original combination - // of the name (e.g., "Mill." in "Quercus robur (Mill.) L." where Mill. is - // the original author). This field is populated only if basionym authorship - // is present. - BasionymAuthorship string `json:"basionymAuthorship,omitempty"` - - // BasionymExAuthorship specifies the "ex" part of the basionym authorship, - // if applicable (e.g., "ex Torr." in "Pinus ponderosa Douglas ex Torr."). - // This field is empty when no "ex" basionym authorship is provided. - BasionymExAuthorship string `json:"basionymExAuthorship,omitempty"` - - // BasionymAuthorshipYear indicates the year tied to the basionym authorship - // (e.g., "1820" in "Pinus ponderosa Douglas, 1820"). This field is included - // only when the basionym year is explicitly stated. - BasionymAuthorshipYear string `json:"basionymAuthorshipYear,omitempty"` - - // VerbatimID is a UUID v5 generated from the verbatim value of the - // input name-string. Every unique string always generates the same - // UUID. - VerbatimID string `json:"id"` - - // ParserVersion is the version number of the GNparser. - ParserVersion string `json:"parserVersion"` -} - // Canonical are simplified forms of a name-string more suitable for // matching and comparing name-strings than the verbatim version. type Canonical struct { @@ -452,56 +241,3 @@ type Year struct { // a question mark "188?", by parentheses "(1888)". 
IsApproximate bool `json:"isApproximate,omitempty"` } - -func (p Parsed) Flatten() ParsedFlat { - res := ParsedFlat{ - Parsed: p.Parsed, - NomCode: p.NomCode, - ParseQuality: p.ParseQuality, - Verbatim: p.Verbatim, - Normalized: p.Normalized, - Cardinality: p.Cardinality, - Rank: p.Rank, - Bacteria: p.Bacteria.String(), - Candidatus: p.Candidatus, - Virus: p.Virus, - Cultivar: p.Cultivar, - DaggerChar: p.DaggerChar, - Hybrid: p.Hybrid.String(), - GraftChimera: p.GraftChimera.String(), - Surrogate: p.Surrogate.String(), - Tail: p.Tail, - VerbatimID: p.VerbatimID, - ParserVersion: p.ParserVersion, - } - if !p.Parsed { - return res - } - - res.CanonicalSimple = p.Canonical.Simple - res.CanonicalFull = p.Canonical.Full - res.CanonicalStemmed = p.Canonical.Stemmed - - if p.Authorship != nil { - au := p.Authorship - res.Authorship = au.Verbatim - - } - - switch detail := p.Details.(type) { - case DetailsUninomial: - res.Uninomial = detail.Uninomial.Value - case DetailsSpecies: - res.Genus = detail.Species.Genus - res.Subgenus = detail.Species.Subgenus - res.Species = detail.Species.Species - case DetailsInfraspecies: - if len(detail.Infraspecies.Infraspecies) == 1 { - res.Genus = detail.Infraspecies.Genus - res.Species = detail.Infraspecies.Species.Species - res.Rank = detail.Infraspecies.Infraspecies[0].Rank - res.Infraspecies = detail.Infraspecies.Infraspecies[0].Value - } - } - return res -}